diff --git a/CHANGES.next.md b/CHANGES.next.md index 74624d26d5..1481579055 100644 --- a/CHANGES.next.md +++ b/CHANGES.next.md @@ -85,6 +85,7 @@ - Add dpb_sparksql_serverless_benchmark, which submits one job for each TPC-DS/H query and measures the whole job execution time, instead of only the query run time. + Add Intel MPI benchmark. ### Enhancements: diff --git a/perfkitbenchmarker/linux_benchmarks/mpi_benchmark.py b/perfkitbenchmarker/linux_benchmarks/mpi_benchmark.py new file mode 100644 index 0000000000..c809fc42fa --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/mpi_benchmark.py @@ -0,0 +1,427 @@ +"""MPI benchmarking tests. + +This could go to the public PKB once we have a handle on the metrics and if +there should be tuning on each of the clouds +""" + +import logging +from typing import Any, Dict, Iterator, List, Tuple +from absl import flags + +from perfkitbenchmarker import benchmark_spec +from perfkitbenchmarker import configs +from perfkitbenchmarker import errors +from perfkitbenchmarker import flag_util +from perfkitbenchmarker import linux_virtual_machine +from perfkitbenchmarker import sample +from perfkitbenchmarker import vm_util + +from perfkitbenchmarker.linux_packages import mpi + +_BaseLinuxVirtualMachine = linux_virtual_machine.BaseLinuxVirtualMachine + +# documents the individual MPI tests in each suite +_MPI_SUITE_TESTS = { + 'IMB-MPI1': [ + 'Allgather', 'Allgatherv', 'Allreduce', 'Alltoall', 'Alltoallv', + 'Barrier', 'Bcast', 'Exchange', 'Gather', 'Gatherv', 'PingPing', + 'PingPong', 'Reduce', 'Reduce_scatter', 'Reduce_scatter_block', + 'Scatter', 'Scatterv', 'Sendrecv' + ], + 'IMB-MPI2': [], + 'IMB-NBC': [ + 'Iallgather', 'Iallgatherv', 'Iallreduce', 'Ialltoall', 'Ialltoallv', + 'Ibarrier', 'Ibcast', 'Igather', 'Igatherv', 'Ireduce', + 'Ireduce_scatter', 'Iscatter', 'Iscatterv' + ], + 'IMB-RMA': [ + 'Accumulate', 'All_get_all', 'All_put_all', 'Bidir_get', 'Bidir_put', + 'Compare_and_swap', 'Exchange_get', 'Exchange_put', 'Fetch_and_op', + 'Get_accumulate', 'One_get_all', 'One_put_all', 'Put_all_local', + 'Put_local', 'Truly_passive_put', 'Unidir_get', 'Unidir_put' + ], + 'IMB-MT': [ + 'AllReduceMT', 'BarrierMT', 'BcastMT', 'BiBandMT', 'ExchangeMT', + 'PingPingMT', 'PingPongMT', 'ReduceMT', 'SendRecvMT', 'UniBandMT' + ] +} + +flags.DEFINE_list('mpi_suites', ['IMB-MPI1'], + 'MPI benchmarks suites: {}.'.format(sorted(_MPI_SUITE_TESTS))) +_BENCHMARKS = flags.DEFINE_list( + 'mpi_benchmarks', [], + ('List of MPI benchmarks. Default is [], which means ' + 'running all benchmarks in the suite.')) +flag_util.DEFINE_integerlist( + 'mpi_threads', [0, 1], 'Number of MPI processes to use per host. For 0 ' + 'use half the number of vCPUs.') +flags.DEFINE_integer('mpi_timeout', 60, 'MPI testing timeout (seconds).') +flags.DEFINE_integer( + 'mpi_iterations', 100000, + 'Number of times to run an individual benchmark for a given byte size.') +flags.DEFINE_bool('mpi_include_zero_byte', False, + 'Whether to include a 0 byte payload in runs.') +_MSG_SIZES = flags.DEFINE_multi_integer( + 'mpi_msglog_sizes', [], ('List of 2^n byte sizes to use. ' + 'Example: [2,8] will use 4 and 64 byte payloads.')) +_MSG_SIZE_MIN = flags.DEFINE_integer('mpi_msglog_min', 10, + '2^n byte message min size.') +_MSG_SIZE_MAX = flags.DEFINE_integer('mpi_msglog_max', 11, + '2^n byte message max size.') +flags.DEFINE_integer( + 'mpi_off_cache_size', -1, + 'Avoids cache-size (use --mpi_off_cache_size= to reuse ' + 'cache, but that gives unrealistic numbers. 
-1 uses the '
+    'value in IMB_mem_info.h.')
+flags.DEFINE_integer('mpi_off_cache_line_size', None,
+                     'Size of a last level cache line.')
+# For more info on how --mpi_ppn changes the MPI rank assignment, see
+# https://software.intel.com/en-us/articles/controlling-process-placement-with-the-intel-mpi-library
+flags.DEFINE_integer(
+    'mpi_ppn', 0, 'Processes/Ranks per node. Defaults to not setting a ppn '
+    'when running tests, instead relying on -map to place threads.')
+
+flags.DEFINE_list(
+    'mpi_env', ['I_MPI_DEBUG=6'],
+    'Comma-separated list of environment variables, e.g. '
+    '--mpi_env=FI_PROVIDER=tcp,FI_LOG_LEVEL=info. '
+    'The default is set to output MPI pinning debugging information.')
+flags.DEFINE_list(
+    'mpi_genv', [], 'Comma-separated list of global environment variables, '
+    'i.e. environment variables to be applied to all nodes, e.g. '
+    '--mpi_genv=I_MPI_PIN_PROCESSOR_LIST=0,I_MPI_PIN=1')
+flags.DEFINE_bool('mpi_record_latency', True,
+                  'Whether to record the individual packet latencies.')
+flags.DEFINE_integer(
+    'mpi_npmin', None, 'Minimum number of processes to use. For IMB, this '
+    'becomes -npmin. If unspecified, no attempt will be made to specify the '
+    'minimum number of processes (i.e. the application defaults will prevail).')
+flags.DEFINE_bool(
+    'mpi_tune', False,
+    'Whether to instruct the mpirun command to use data collected by an MPI '
+    'tuning utility like mpitune, e.g. by passing -tune to mpirun. Consider '
+    'using in conjunction with specifying the tuning data directory, e.g. for '
+    'Intel MPI setting I_MPI_TUNER_DATA_DIR.')
+flags.DEFINE_bool(
+    'mpi_multi', True,
+    'Whether to instruct the mpirun command to set -multi and run with '
+    'multiple groups as opposed to just one.')
+
+FLAGS = flags.FLAGS
+
+BENCHMARK_NAME = 'mpi'
+
+BENCHMARK_CONFIG = """
+mpi:
+  description: Runs the MPI benchmarks
+  vm_groups:
+    default:
+      vm_count: 2
+      vm_spec:
+        GCP:
+          machine_type: n1-standard-4
+          zone: us-west1-a
+        AWS:
+          machine_type: c5.xlarge
+          zone: us-west-1c
+        Azure:
+          machine_type: Standard_B2s
+          zone: eastus
+"""
+
+# These columns in the MPI output data are surfaced as sample.Sample metrics.
+_METRIC_NAMES = frozenset(['time_avg', 'time_overall'])
+
+flags.register_validator(
+    'mpi_suites',
+    lambda suites: set(suites) <= set(_MPI_SUITE_TESTS),
+    message='--mpi_suites values must be in {}'.format(
+        sorted(_MPI_SUITE_TESTS.keys())))
+
+flags.register_validator(
+    'mpi_env',
+    lambda env_params: all('=' in param for param in env_params),
+    message='--mpi_env values must be in format "key=value" or "key="')
+
+flags.register_validator(
+    'mpi_genv',
+    lambda genv_params: all('=' in param for param in genv_params),
+    message='--mpi_genv values must be in format "key=value" or "key="')
+
+
+def GetConfig(user_config: Dict[str, Any]) -> Dict[str, Any]:
+  """Returns the benchmark config to use.
+
+  Args:
+    user_config: Pre-defined config.
+
+  Raises:
+    InvalidFlagConfigurationError: If user-supplied flags are incorrect.
+  """
+  if _MSG_SIZES.value:
+    if FLAGS['mpi_msglog_min'].present or FLAGS['mpi_msglog_max'].present:
+      raise errors.Setup.InvalidFlagConfigurationError(
+          'If --mpi_msglog_sizes is set, cannot set '
+          '--mpi_msglog_min or --mpi_msglog_max')
+  if _BENCHMARKS.value:
+    all_tests = set()
+    for tests in _MPI_SUITE_TESTS.values():
+      all_tests.update(_LowerList(tests))
+    unknown_tests = set(_LowerList(_BENCHMARKS.value)).difference(all_tests)
+    if unknown_tests:
+      raise errors.Setup.InvalidFlagConfigurationError(
+          f'Unknown MPI benchmarks: "{",".join(sorted(unknown_tests))}"')
+  config = configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+  if FLAGS['num_vms'].present:
+    config['vm_groups']['default']['vm_count'] = FLAGS.num_vms
+  return config
+
+
+def Prepare(spec: benchmark_spec.BenchmarkSpec) -> None:
+  """Installs the MPI package on all VMs and verifies the install.
+
+  Args:
+    spec: The benchmark spec.
+  """
+  vms = spec.vms
+  vm_util.RunThreaded(lambda vm: vm.AuthenticateVm(), vms)
+  logging.info('Installing mpi package')
+  vm_util.RunThreaded(lambda vm: vm.Install('mpi'), vms)
+  mpi.VerifyInstall(vms)
+
+
+def Run(spec: benchmark_spec.BenchmarkSpec) -> List[sample.Sample]:
+  """Runs all of the MPI tests.
+
+  Args:
+    spec: The benchmark spec.
+
+  Returns:
+    List of sample.Samples.
+  """
+  vms = spec.vms
+  # For --mpi_threads=0 the number of threads per host is half the vCPUs.
+  samples = []
+  # The count of real CPUs on the VM: for SMT runs (the default) it is one half
+  # of the number of vCPUs. When SMT is disabled it is the count of the CPUs.
+  real_cpus = vms[0].NumCpusForBenchmark(True)
+  for process_count in FLAGS.mpi_threads:
+    process_count = process_count or real_cpus
+    # Indicates whether the run is using the optimal HPC configuration of one
+    # thread per real CPU (1/2 of the vCPUs).
+    on_real_cpus = process_count == real_cpus
+    samples.extend(
+        _RunTest(vms,
+                 process_count * len(vms),  # this is the number of ranks
+                 FLAGS.mpi_ppn, on_real_cpus))
+  for item in samples:
+    # TODO(user) reenable installing MKL when Intel repos work
+    # google3/cloud/performance/artemis/internal_packages/internal_intelmpi.py;l=65
+    item.metadata['installed_mkl'] = False
+  return samples
+
+
+def _RunTest(vms: List[_BaseLinuxVirtualMachine], total_processes: int,
+             ppn: int, on_real_cpus: bool) -> List[sample.Sample]:
+  """Runs the MPI tests for the given number of processes per host.
+
+  Args:
+    vms: List of virtual machines to use in the test.
+    total_processes: The total number of processes to run across all nodes.
+    ppn: Processes per node.
+    on_real_cpus: Whether the number of MPI processes is equal to the number of
+      real CPUs (vCPUs / 2).
+
+  Returns:
+    List of sample.Samples.
+  """
+  # metadata that's constant for all runs
+  samples = []
+  for suite in FLAGS.mpi_suites:
+    for request in _CreateRequestWithFlagParameters(
+        vms=vms,
+        total_processes=total_processes,
+        suite=suite,
+        tests=_GetTests(suite),
+        ppn=ppn):
+      response = mpi.RunMpiStats(vms[0], request)
+      for item in _CreateSamples(response):
+        item.metadata['mpi_suite'] = suite
+        samples.append(item)
+  # Fill in metadata common to all samples.
+ hosts = [vm.internal_ip for vm in vms] + for item in samples: + item.metadata.update({ + 'compile_from_source': FLAGS.imb_compile_from_source, + 'threads_half_cpus': on_real_cpus, + 'smt_enabled': vms[0].IsSmtEnabled(), + 'threads': total_processes, + 'number_nodes': len(hosts), + 'nodes': str(','.join(sorted(hosts))), + 'processes_per_host': total_processes // len(hosts), + 'ppn': ppn, + 'mpi_env': ','.join(sorted(FLAGS.mpi_env + FLAGS.mpi_genv)), + 'tune': FLAGS.mpi_tune, + }) + for mpi_item in FLAGS.mpi_env + FLAGS.mpi_genv: + key, value = mpi_item.split('=', 1) + item.metadata['mpi_env_' + key] = value + return samples + + +def _CreateSamples(response: mpi.MpiResponse) -> Iterator[sample.Sample]: + """Generates samples for each result in the response.""" + for result in response.results: + for row in result.data: + for item in _MpiDataToSamples(row): + item.metadata.update({ + 'mpi_run': response.mpi_run, + 'mpi_args': response.args, + 'mpi_vendor': response.vendor, + 'mpi_version': response.version, + 'mpi_benchmark': result.benchmark, + }) + if result.groups is not None: + item.metadata['mpi_groups'] = result.groups + if result.processes_per_group is not None: + item.metadata['mpi_processes_per_group'] = result.processes_per_group + if result.groups is not None: + item.metadata[ + 'mpi_ranks'] = result.processes_per_group * result.groups + else: # only one group => ranks = ppg + item.metadata['mpi_ranks'] = result.processes_per_group + if result.mode: + item.metadata['mpi_mode'] = result.mode + if result.group_layout: + # Convert {0: [1,2], 1: [3,4]} into '0=1,2;1=3,4' + layout = [] + for group_number, cpu_ids in sorted(result.group_layout.items()): + layout.append(f'{group_number}=' + f'{",".join(str(cpu) for cpu in cpu_ids)}') + item.metadata['mpi_layout'] = ';'.join(layout) + else: + item.metadata['mpi_layout'] = None + if response.mpi_pinning: + item.metadata['mpi_pinning'] = ';'.join(response.mpi_pinning) + if response.mpi_env: + mpi_env = sorted(response.mpi_env.items()) + item.metadata['mpi_running_env'] = ';'.join( + f'{key}={value}' for key, value in mpi_env) + yield item + + +def _MpiDataToSamples(row: mpi.MpiData) -> List[sample.Sample]: + """Returns the individual MPI result row as a list of Samples. + + MpiData stores the results of a run for a given benchmark ("PingPong") that + specifies the: + bytes=(integer payload byte size for the run) + is_error=(whether this run was timed out) + and has one or both of the following if the run did not time out: + data={dict of latency percentages : latency in usec} + histogram={dict of latency in usec : count of packets} + + This method returns [Samples] as the dict of latencies and the histogram dict + are reported as individual samples. + + Args: + row: A latency/histogram value for a given number of bytes. 
+ """ + if row.is_error: + metadata = {'bytes': row.bytes, 'mpi_timeout': FLAGS.mpi_timeout} + # value=1 so that the timeline chart can show a blip when this happens + return [sample.Sample('timeout_error', 1, 'count', metadata)] + found_metrics = _METRIC_NAMES.intersection(row.data) + if not found_metrics: + logging.warning('Skipping row %s as missing a required metric name %s', row, + _METRIC_NAMES) + return [] + metric = list(found_metrics)[0] + ret = [sample.Sample(metric, row.data[metric], 'usec', row.data)] + if row.histogram: + # change the key of the histogram to a string to match existing TCP_RR data + metadata = { + 'histogram': { + str(latency): count for latency, count in row.histogram.items() + }, + } + ret.append(sample.Sample('MPI_Latency_Histogram', 0, 'usec', metadata)) + for item in ret: + item.metadata.update({ + 'bytes': row.bytes, + 'repetitions': row.repetitions, + }) + return ret + + +def Cleanup(spec: benchmark_spec.BenchmarkSpec) -> None: + del spec # Unused + + +def _CreateRequestWithFlagParameters(vms: List[_BaseLinuxVirtualMachine], + total_processes: int, suite: str, + tests: List[str], + ppn: int) -> Iterator[mpi.MpiRequest]: + """Yields an MpiRequest using settings passed in as flags. + + If told to record MPI latencies (--mpi_record_latency) then must create + individual runs for each byte length. Flags of --mpi_msglog_min=10, + --mpi_msglog_max=12 generates 3 MpiRequests of (msglog_min=10,msglog_max=10), + (msglog_min=11,msglog_max=11), (msglog_min=12,msglog_max=12) + + Args: + vms: List of VMs to run on. + total_processes: The total number of MPI processes to run over all VMs. + suite: The name of the MPI suite to run. + tests: The individual MPI tests to run. An MpiRequest is created for each. + ppn: The Processes Per Node, passed along to mpirun. + """ + msglog_sizes: List[Tuple[int, int]] = [] + if _MSG_SIZES.value: + msglog_sizes = [(size, size) for size in _MSG_SIZES.value] + else: + if FLAGS.mpi_record_latency: + # MUST pass in only one size at a time to the mpirun command + # to get a single dump file for the run + msglog_sizes = [ + (size, size) + for size in range(FLAGS.mpi_msglog_min, FLAGS.mpi_msglog_max + 1) + ] + else: + msglog_sizes = [(FLAGS.mpi_msglog_min, FLAGS.mpi_msglog_max)] + for test in tests: + for msglog_min, msglog_max in msglog_sizes: + yield mpi.MpiRequest( + vms=vms, + total_processes=total_processes, + suite=suite, + tests=[test], + ppn=ppn, + msglog_min=msglog_min, + msglog_max=msglog_max, + timeout=FLAGS.mpi_timeout, + off_cache_size=FLAGS.mpi_off_cache_size, + off_cache_line_size=FLAGS.mpi_off_cache_line_size, + iterations=FLAGS.mpi_iterations, + include_zero_byte=FLAGS.mpi_include_zero_byte, + compile_from_source=FLAGS.imb_compile_from_source, + environment=FLAGS.mpi_env, + global_environment=FLAGS.mpi_genv, + record_latencies=FLAGS.mpi_record_latency, + npmin=FLAGS.mpi_npmin, + tune=FLAGS.mpi_tune, + multi=FLAGS.mpi_multi) + + +def _LowerList(elements: List[str]) -> List[str]: + """Returns the list with all items lowercased.""" + return [item.lower() for item in elements] + + +def _GetTests(suite: str) -> List[str]: + """Returns the tests to run for this benchmark run. + + Args: + suite: The MPI suite to use. + + Returns: + List of individual benchmarks to run. 
+ """ + tests = _BENCHMARKS.value or _MPI_SUITE_TESTS[suite] + all_tests = set(_LowerList(_MPI_SUITE_TESTS[suite])) + return [test for test in tests if test.lower() in all_tests] diff --git a/perfkitbenchmarker/linux_packages/imb.py b/perfkitbenchmarker/linux_packages/imb.py new file mode 100644 index 0000000000..75b9cdfb19 --- /dev/null +++ b/perfkitbenchmarker/linux_packages/imb.py @@ -0,0 +1,209 @@ +r"""Installs MPI library (Intel or OpenMPI) and compiles Intel MPI benchmarks (IMB) from source.""" +import logging +import posixpath +from typing import List, Optional + +from absl import flags +from perfkitbenchmarker.linux_packages import intel_repo +from perfkitbenchmarker.linux_packages import intelmpi + +FLAGS = flags.FLAGS + +COMPILE_FROM_SOURCE = flags.DEFINE_bool( + 'imb_compile_from_source', True, + 'Whether to compile the Intel MPI benchmarks from source.') + +_INTEL_DIR = '/opt/intel' +_INTEL_COMPILER_DIR = posixpath.join(_INTEL_DIR, + 'compilers_and_libraries/linux') +_INTEL_COMPILER_DIR_2020 = posixpath.join(_INTEL_DIR, + 'compilers_and_libraries_2020/linux') + +# TBB: Intel's "Thread Building Blocks" for multithreaded programs +# https://en.wikipedia.org/wiki/Threading_Building_Blocks +_INTEL_FIX_TBBROOT_CMD = ( + "sudo sed -i 's" + "#TBBROOT=SUBSTITUTE_INSTALL_DIR_HERE#TBBROOT={compiler_dir}/tbb#' " + '{compiler_dir}/tbb/bin/tbbvars.sh') + +# Source for the Intel MPI benchmarks +_GITHUB_URL = 'https://github.com/intel/mpi-benchmarks.git' +_GITHUB_COMMIT = '2d752544461f04111efef0926efe46826d90f720' +# Directory for the MPI benchmarks +_MPI_BENCHMARK_DIR = 'mpi-benchmarks' +# Checks out the Intel MPI benchmarks +_GIT_CHECKOUT_CMD = (f'git clone -n {_GITHUB_URL}; cd mpi-benchmarks; ' + f'git checkout {_GITHUB_COMMIT}') + +# Patch file and command to add latency histogram to Intel test code +_PATCH_FILE = 'intelmpi.patch' +_GIT_PATCH_CMD = f'patch -d {_MPI_BENCHMARK_DIR} -p3 < ~/{_PATCH_FILE}' + +# Enable verbose logging when mpirun fails due to a segfault +_ENABLE_VERBOSE_SEGFAULT_LOGS = ('echo 1 | sudo tee -a ' + '/proc/sys/kernel/print-fatal-signals') + + +def _InstallForIntelMpiLibrary( + vm) -> None: + """Compiles the Intel MPI benchmarks for Intel MPI library.""" + if intel_repo.UseOneApi(): + vm.InstallPackages('intel-oneapi-compiler-dpcpp-cpp') + vm.InstallPackages('intel-oneapi-mpi-devel') # for mpi.h + source_cmds = f'. {intel_repo.ONEAPI_VARS_FILE}' + else: + source_cmds = (f'. {_INTEL_DIR}/mkl/bin/mklvars.sh intel64; ' + f'. {_INTEL_COMPILER_DIR}/bin/compilervars.sh intel64') + for compiler_dir in (_INTEL_COMPILER_DIR, _INTEL_COMPILER_DIR_2020): + vm.RemoteCommand( + _INTEL_FIX_TBBROOT_CMD.format(compiler_dir=compiler_dir), + ignore_failure=True) + vm.RemoteCommand(_GIT_CHECKOUT_CMD) + vm.PushDataFile(_PATCH_FILE) + vm.RemoteCommand(_GIT_PATCH_CMD) + # Default make uses the Intel compiler (mpiicc) not available in repos + # {source_cmds} filled in at runtime due to differences in 2018/19 vs 2021 + compile_benchmark_cmd = ( + f'cd {_MPI_BENCHMARK_DIR}; {source_cmds}; CC=mpicc CXX=mpicxx make') + vm.RemoteCommand(compile_benchmark_cmd) + vm.RemoteCommand(_ENABLE_VERBOSE_SEGFAULT_LOGS) + + +def _InstallForOpenMpiLibrary( + vm) -> None: + """Compiles the Intel MPI benchmarks for OpenMPI library.""" + vm.RemoteCommand(_GIT_CHECKOUT_CMD) + vm.PushDataFile(_PATCH_FILE) + vm.RemoteCommand(_GIT_PATCH_CMD) + # When installing OpenMPI, openmpi.py runs ./configure.sh with --prefix=/usr. 
+ compile_benchmark_cmd = ( + f'cd {_MPI_BENCHMARK_DIR}; CC=/usr/bin/mpicc CXX=/usr/bin/mpicxx make') + vm.RemoteCommand(compile_benchmark_cmd) + vm.RemoteCommand(_ENABLE_VERBOSE_SEGFAULT_LOGS) + + +def Install(vm) -> None: + """Installs MPI lib and compiles the Intel MPI benchmarks from source. + + Args: + vm: Virtual machine to run on. + """ + if FLAGS.mpi_vendor == 'intel': + mpilib = 'intelmpi' + install_benchmarks = _InstallForIntelMpiLibrary + elif FLAGS.mpi_vendor == 'openmpi': + if not COMPILE_FROM_SOURCE.value: + raise ValueError( + f'--mpi_vendor=openmpi requires --{COMPILE_FROM_SOURCE.name}') + mpilib = 'openmpi' + install_benchmarks = _InstallForOpenMpiLibrary + + vm.Install(mpilib) + if not COMPILE_FROM_SOURCE.value: + return + logging.info('Installing Intel MPI benchmarks from source') + vm.Install('build_tools') + install_benchmarks(vm) + + +def _MpiRunCommandForIntelMpiLibrary( + vm, hosts: List[str], + total_processes: int, ppn: int, environment: List[str], + global_environment: List[str], tune: bool) -> str: + """String command to call mpirun using Intel MPI library. + + See Intel docs for details: + https://software.intel.com/content/www/us/en/develop/documentation/mpi-developer-guide-linux/top/running-applications/controlling-process-placement.html + + "If the -ppn option is not specified, the process manager assigns as many + processes to the first node as there are physical cores on it. Then the next + node is used." + + If the ppn should not be specified in the command pass in ppn=0. However you + most likely want to pass it in so that the number of processes on each node + is balanced. + + Args: + vm: Virtual machine to run on. + hosts: List of internal IP addresses to run on. + total_processes: The total number of processes to use across all hosts. + ppn: Number of processes per node to use when assigning processes per node. + environment: List of environment variables to set, e.g. "FI_PROVIDER=tcp". + global_environment: List of global environment variables to set via the + '-genv' option to mpirun, e.g. "I_MPI_PIN_PROCESSOR_LIST=0". + tune: Whether to pass -tune. If true, consider setting the + I_MPI_TUNER_DATA_DIR environment variable. + + Returns: + String command to use in a vm.RemoteCommand call. + """ + cmd_elements = [f'{intelmpi.SourceMpiVarsCommand(vm)};'] + cmd_elements.extend(sorted(environment)) + cmd_elements.append('mpirun') + if tune: + cmd_elements.append('-tune') + cmd_elements.extend( + f'-genv {variable}' for variable in sorted(global_environment)) + cmd_elements.append(f'-n {total_processes}') + # hosts MUST remain in same order so that latency file created on first host + hosts_str = ','.join(hosts) + cmd_elements.append(f'-hosts {hosts_str}') + if ppn: + cmd_elements.append(f'-ppn {ppn}') + elif total_processes == len(hosts): + # for single-threaded runs tell MPI to run one thread on each VM + cmd_elements.append('-ppn 1') + return ' '.join(cmd_elements) + + +def _MpiRunCommandForOpenMpiLibrary(hosts: List[str], total_processes: int, + npernode: int, + environment: List[str]) -> str: + """String command to call mpirun using OpenMPI library. + + Args: + hosts: List of internal IP addresses to run on. + total_processes: Translates directly to mpirun's -n option. + npernode: Translates directly to mpirun's -npernode option. If 0, then + -npernode is set to total_processes//len(hosts). + environment: List of envionrment variables to export via mpirun -x. E.g. + "OMPI_MCA_btl=self,tcp" or "OMPI_MCA_rmaps_base_mapping_policy=core:PE=1". 
+ See https://www.open-mpi.org/doc/v3.0/man1/mpirun.1.php for details. + + Returns: + String command to use in a vm.RemoteCommand call. + """ + + cmd_elements = [f'{env_var}' for env_var in environment] + cmd_elements.append('mpirun') + cmd_elements.extend( + [f'-x {env_var.split("=", 1)[0]}' for env_var in environment]) + + # Useful for verifying process mapping. + cmd_elements.append('-report-bindings') + cmd_elements.append('-display-map') + + cmd_elements.append(f'-n {total_processes}') + if not npernode: + npernode = total_processes // len(hosts) + cmd_elements.append(f'-npernode {npernode}') + cmd_elements.append('--use-hwthread-cpus') + # Guarantee that each host has sufficient slots (conservatively). + hosts_str = ','.join([f'{h}:slots={total_processes}' for h in hosts]) + cmd_elements.append(f'-host {hosts_str}') + + return ' '.join(cmd_elements) + + +def MpiRunCommand(vm, + hosts: List[str], total_processes: int, ppn: int, + environment: List[str], global_environment: List[str], + tune: bool) -> Optional[str]: + """String command to call mpirun.""" + if FLAGS.mpi_vendor == 'intel': + return _MpiRunCommandForIntelMpiLibrary(vm, hosts, total_processes, ppn, + environment, global_environment, + tune) + elif FLAGS.mpi_vendor == 'openmpi': + return _MpiRunCommandForOpenMpiLibrary(hosts, total_processes, ppn, + environment) diff --git a/perfkitbenchmarker/linux_packages/mpi.py b/perfkitbenchmarker/linux_packages/mpi.py new file mode 100644 index 0000000000..79677ff360 --- /dev/null +++ b/perfkitbenchmarker/linux_packages/mpi.py @@ -0,0 +1,641 @@ +"""Installs the MPI library and runs the IMB-MPI1 tests. + +Installation of the MPI library is handed off into imb.py, as +compilation of the benchmarks must be done differently depending on the MPI +library being used. + +The run_benchmarks.sh script is copied to the remote server and runs the MPI +tests. The text output is parsed by MpiResultParser. +""" + +import collections +import dataclasses +import logging +import os +import posixpath +import re +from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union +import uuid + +from absl import flags +from perfkitbenchmarker import errors +from perfkitbenchmarker import temp_dir +from perfkitbenchmarker.linux_packages import imb +from perfkitbenchmarker.linux_packages import intelmpi +from perfkitbenchmarker.linux_packages import omb +from perfkitbenchmarker.linux_packages import openmpi + +FLAGS = flags.FLAGS + + +@dataclasses.dataclass +class MpiData: + """Data for an MPI run, including headers. + + A row of the MPI run could be a timeout error or the actual data for a run + stored in the data attribute. A typical value for that is + { + "throughput":10.13, + "time_avg":193.73, + "time_max":202.26, + "time_min":186.85 + } + """ + bytes: Optional[int] = None + repetitions: Optional[int] = None + data: Optional[Dict[str, Union[int, float]]] = None + is_error: bool = False + histogram: Optional[Dict[float, int]] = None + + +@dataclasses.dataclass +class MpiResult: + """Individual runs of a MPI benchmark test. + + For example this could be the PingPong run results. 
+  """
+  benchmark: str
+  data: List[MpiData]
+  groups: Optional[int] = None
+  processes_per_group: Optional[int] = None
+  mode: Optional[str] = None
+  group_layout: Optional[Dict[int, List[int]]] = None
+
+
+@dataclasses.dataclass
+class MpiResponse:
+  """Response to the RunMpiStats call."""
+  mpi_run: str
+  args: str
+  vendor: str
+  version: str
+  results: List[MpiResult]
+  mpi_pinning: List[str]
+  mpi_env: Dict[str, str]
+
+
+@dataclasses.dataclass
+class MpiRequest:
+  """Parameters for running an MPI test.
+
+  See the FLAGS.mpi_XXX definitions in mpi_benchmark.py for details.
+  """
+  # TODO(andytzhu): Get rid of Any, and handle importing linux_virtual_machine
+  # without encountering circular dependencies
+  vms: List[Any]  # virtual machines
+  total_processes: int
+  suite: str
+  tests: List[str]
+  ppn: int
+  msglog_min: int
+  msglog_max: int
+  timeout: int
+  off_cache_size: int
+  off_cache_line_size: Optional[int]
+  iterations: int
+  include_zero_byte: bool
+  compile_from_source: bool
+  environment: List[str] = dataclasses.field(default_factory=list)
+  global_environment: List[str] = dataclasses.field(default_factory=list)
+  record_latencies: bool = False
+  npmin: Optional[int] = None
+  tune: bool = False
+  multi: bool = False
+
+
+# The same order as the output in the print_tail function in the patched code.
+LATENCY_HEADERS: List[str] = [
+    'latency_min', 'latency_p10', 'latency_p25', 'latency_p50', 'latency_p75',
+    'latency_p90', 'latency_p95', 'latency_p99', 'latency_p99.5',
+    'latency_p99.9', 'latency_p99.99', 'latency_max'
+]
+
+# Regexes for parsing I_MPI_DEBUG=4 and higher output.
+_MPI_STARTUP_PREFIX = r'^\[(\d+)\] MPI startup\(\):\s+'
+_MPI_ENV_RE = re.compile(_MPI_STARTUP_PREFIX +
+                         r'(?P<mpi_var>I_MPI.*?)=(?P<mpi_value>.*)')
+
+
+def Install(vm) -> None:
+  """See base class."""
+  # Installs imb, which installs the specified MPI library and compiles
+  # the patched MPI benchmark appropriately for the specified MPI library.
+  vm.Install('imb')
+  VerifyInstall([vm])
+  logging.info('Successfully installed MPI on %s', vm)
+
+
+def VerifyInstall(vms) -> None:
+  """Runs a simple test to confirm MPI is installed correctly.
+
+  Args:
+    vms: List of virtual machines to include in the test.
+  """
+  request = MpiRequest(
+      vms=vms,
+      total_processes=vms[0].NumCpusForBenchmark(),
+      suite='IMB-MPI1',
+      tests=['PingPong'],
+      ppn=vms[0].NumCpusForBenchmark(),
+      msglog_min=10,
+      msglog_max=11,
+      timeout=20,
+      off_cache_size=-1,
+      off_cache_line_size=None,
+      iterations=100,
+      include_zero_byte=False,
+      compile_from_source=True,
+      record_latencies=False,
+      multi=True)
+  RunMpiStats(vms[0], request)
+
+
+def GetMpiVersion(vm) -> Optional[str]:
+  """Returns the MPI version to use for the given OS type."""
+  if FLAGS.mpi_vendor == 'intel':
+    return intelmpi.MPI_VERSION.value
+  elif FLAGS.mpi_vendor == 'openmpi':
+    return openmpi.GetMpiVersion(vm)
+
+
+def RunMpiStats(vm, request: MpiRequest) -> MpiResponse:
+  """Runs the MPI tests.
+
+  The args field of the returned MpiResponse holds all of the command line
+  arguments used to run the test except for the names of the hosts, so that
+  results in the database share a common value to filter on.
+
+  Args:
+    vm: Virtual machine to run on.
+    request: An MpiRequest that has the parameters for this test.
+
+  Returns:
+    MpiResponse with the parsed results.
+  """
+  hosts = [vm.internal_ip for vm in request.vms]
+
+  mpirun = imb.MpiRunCommand(vm, hosts, request.total_processes, request.ppn,
+                             request.environment, request.global_environment,
+                             request.tune)
+  if request.record_latencies:
+    latency_file = '/tmp/latency-{}-{}.txt'.format(request.tests[0],
+                                                   uuid.uuid4().hex[:8])
+  else:
+    latency_file = None
+  common = ' '.join(
+      BuildMpiBenchmarkArgs(request, latency_file, bool(request.ppn)))
+  try:
+    stdout, stderr = vm.RobustRemoteCommand(mpirun + ' ' + common)
+  except errors.VirtualMachine.RemoteCommandError:
+    # Tail the last 100 lines of syslog as it might tell us something.
+    for client_vm in request.vms:
+      logging.info('VM syslog for %s', client_vm.name)
+      client_vm.RemoteCommand(
+          'sudo tail -n 100 /var/log/syslog /var/log/messages || exit')
+    raise
+  if stderr:
+    # SSH displays a warning but this could also contain mpirun errors.
+    logging.warning('Stderr when running MPI command: %s', stderr)
+  lines = stdout.splitlines()
+  results = list(MpiResultParser(lines))
+  if latency_file:
+    latencies = _GroupLatencyLines(vm, latency_file, request.iterations)
+    if latencies:
+      _CreateMpiDataForHistogram(latencies, results)
+  return MpiResponse(
+      mpi_run=mpirun,
+      args=common,
+      vendor=FLAGS.mpi_vendor,
+      version=GetMpiVersion(vm),
+      results=results,
+      mpi_pinning=omb.ParseMpiPinning(lines),
+      mpi_env=ParseMpiEnv(lines))
+
+
+class MpiResultParser(Iterable[MpiResult]):
+  """Parses the output of the MPI tests.
+
+  This is an iterator where each next item is an MpiResult.
+  """
+  _NAME = re.compile('^# Benchmarking (.*)')
+  _GROUP1 = re.compile(
+      r'.*?(?P<groups>\d+) groups of (?P<processes>\d+) processes')
+  _GROUP2 = re.compile(r'# #processes = (?P<processes>\d+)')
+  _GROUP_LAYOUT = re.compile(r'# Group\s+(\d+):\s+(.*)')
+  _GROUP_LAYOUT_FOLLOWON = re.compile(r'#\s+(\d+[\s\d]*)')
+  _HEADERS = re.compile(r'.*#repetitions.*usec')
+  _TIMEOUT = re.compile(r'(\d+) time-out')
+  _MODE = re.compile(r'#\s+MODE: (\S+)')
+  # for "t[usec]": https://software.intel.com/en-us/imb-user-guide-put-all-local
+  _MPI_HEADER_MAPPING = {
+      '#bytes': 'bytes',
+      '#repetitions': 'repetitions',
+      't_min[usec]': 'time_min',
+      't_max[usec]': 'time_max',
+      't_avg[usec]': 'time_avg',
+      'Mbytes/sec': 'throughput',
+      'Msg/sec': 'messages_per_sec',
+      't_ovrl[usec]': 'time_overall',
+      't_pure[usec]': 'time_pure',
+      't_CPU[usec]': 'time_cpu',
+      'overlap[%]': 'overlap_percent',
+      't[usec]': 'time_avg',
+  }
+  # These columns are integers; the others are floats.
+  _INT_COLUMNS = set(['bytes', 'repetitions'])
+
+  def __init__(self, lines: Sequence[str]):
+    # _lines is an iterator over the input parameter lines.
+    self._lines = (line.strip() for line in lines)
+
+  def __iter__(self) -> Iterator[MpiResult]:
+    """Yields the next MpiResult from the input lines."""
+    while True:
+      value = self._NextValue()
+      if value:
+        yield value
+      else:
+        break
+
+  def _NextValue(self) -> Optional[MpiResult]:
+    """Returns the next MpiResult or None if no more entries."""
+    name = self._BenchmarkName()
+    if not name:
+      return None
+    logging.info('Parsing benchmark %s', name)
+    groups, processes = self._NumberGroups()
+    group_layout = self._GroupLayout()
+    mode, headers = self._Headers()
+    data = []
+    # If the previous run timed out, don't record the bogus latency numbers.
+    last_row_is_error: bool = False
+    for row in self._Data(headers):
+      if not last_row_is_error:
+        data.append(row)
+      last_row_is_error = row.is_error
+    return MpiResult(name, data, groups, processes, mode, group_layout)
+
+  def _BenchmarkName(self) -> Optional[str]:
+    for line in
self._lines: + m = self._NAME.match(line) + if m: + return m.group(1) + + def _NumberGroups(self) -> Tuple[Optional[int], int]: + """Return a tuple of the number of MPI groups and processes for the test.""" + for line in self._lines: + m = self._GROUP1.match(line) + if m: + # this MPI test has both the "groups" and "processes" attributes + return int(m.group('groups')), int(m.group('processes')) + m = self._GROUP2.match(line) + if m: + # This MPI test does not have a "groups" attribute, but "processes". + return None, int(m.group('processes')) + raise errors.Benchmarks.RunError('Did not find number of processes') + + def _GroupLayout(self) -> Optional[Dict[int, List[int]]]: + """Returns the MPI group CPU layout. + + Parses this input: + + # Group 0: 0 1 + # + # Group 1: 2 3 + # + #--------------------------------------------------- + + Into {0: [0,1], 1: [2,3]} + """ + layout = {} + last_group_number = -1 # to satisfy pytyping + for line in self._lines: + m = self._GROUP_LAYOUT.match(line) + if not m and not layout: + # no group layout in this output + return None + if m: + last_group_number = int(m.group(1)) + layout[last_group_number] = [int(cpu) for cpu in m.group(2).split()] + continue + # check for a continuation of the list of cpus + m = self._GROUP_LAYOUT_FOLLOWON.match(line) + if m: + layout[last_group_number] = [int(cpu) for cpu in m.group(1).split()] + continue + if not re.match(r'^#\s*$', line): + # Only other acceptable line is blank + break + return layout + + def _Headers(self) -> Tuple[Optional[str], Sequence[str]]: + """Returns a tuple of (benchmark mode, List of headers for data).""" + mode = None + for line in self._lines: + m = self._MODE.match(line) + if m: + mode = m.group(1) + continue + m = self._HEADERS.match(line) + if m: + return mode, line.split() + raise errors.Benchmarks.RunError('No headers found') + + def _Data(self, headers: Sequence[str]) -> Iterator[MpiData]: + """Yields MpiData for each row of a benchmark's results. + + Example input: + 0 1000000 1.17 1.17 1.17 0.00 + [ 0.83, 0.97, 0.98, 1.00, 1.02, 1.72, 1.75, 2.28, 3.12, 6.73, 65.19 ] + 1024 1000000 1.80 1.80 1.80 569.96 + [ 1.16, 1.27, 1.29, 1.81, 2.06, 2.17, 2.40, 3.46, 4.34, 10.27, 215.10 ] + + Will yield 2 MpiData records + + Args: + headers: The headers for this row of data. + """ + # Keep the last non-latency data row as the next row might contain the + # percent latency numbers for it. + on_deck: MpiData = None + for line in self._lines: + if not line: + break + m = self._TIMEOUT.match(line) + if m: + # This is a timeout error + if on_deck: # emit the last row if available + yield on_deck + yield MpiData(is_error=True, bytes=int(m.group(1))) + on_deck: MpiData = None + elif line.startswith('['): + # This is [p_min, p10, p..] 
list of latencies + values: List[float] = [ + float(part.strip()) for part in line[1:-1].split(',') + ] + percentiles: Dict[str, float] = dict(zip(LATENCY_HEADERS, values)) + if not on_deck: + logging.warning('Have percentiles but no previous mpidata %s', + percentiles) + continue + if sum(values) == 0.0: + # only tests that have been patched have the percentile metrics + logging.info('No percentiles data for benchmark') + else: + on_deck.data.update(percentiles) + yield on_deck + on_deck: MpiData = None + else: + # This is the regular MPI output of time_avg + if on_deck: + yield on_deck + data = self._DataIntoMap(headers, line.split()) + number_bytes = data.pop('bytes', 0) + repetitions = data.pop('repetitions', -1) + on_deck = MpiData( + bytes=number_bytes, repetitions=repetitions, data=data) + if on_deck: + # Last record in this stanza was a normal MPI row. + yield on_deck + + def _DataIntoMap(self, headers: Sequence[str], + data: Sequence[str]) -> Dict[str, Union[int, float]]: + """Converts the a row of data from the MPI results into a dict. + + Args: + headers: The column headers. + data: A row of data from the MPI output. + + Returns: + Dict of the header name to the value. + """ + row = {} + for raw_header, raw_value in zip(headers, data): + new_header = self._MPI_HEADER_MAPPING[raw_header] + row[new_header] = self._ConvertValue(new_header, raw_value) + return row + + def _ConvertValue(self, header: str, value: str) -> Union[int, float]: + return int(value) if header in self._INT_COLUMNS else float(value) + + +def BuildMpiBenchmarkArgs(request: MpiRequest, latency_file: Optional[str], + ppn_set: bool) -> List[str]: + """Creates the common arguments to pass to mpirun. + + See https://software.intel.com/en-us/imb-user-guide-command-line-control + + Args: + request: An MpiRequest object for the run's configuration. + latency_file: If present the output file to record the individual packet + latencies. + ppn_set: Whether this benchmark was run with a set ppn. + + Returns: + List of string arguments for mpirun. + """ + args: List[str] = [] + if request.compile_from_source: + args.append(posixpath.join('mpi-benchmarks', request.suite)) + else: + args.append(request.suite) + # only add -msglog if told to do so + if request.suite in ('IMB-MPI1', 'IMB-RMA', + 'IMB-NBC') and request.msglog_max is not None: + if request.msglog_min is None: + arg = request.msglog_max + else: + arg = '{}:{}'.format(request.msglog_min, request.msglog_max) + args.append('-msglog {}'.format(arg)) + if request.suite != 'IMB-MT': + # -multi is trinary: not present, 0, 1 + if request.multi: + args.append('-multi 0') + args.append('-time {}'.format(request.timeout)) + # only add -off_cache if told to do so + if request.off_cache_size: + arg = '-off_cache {}'.format(request.off_cache_size) + if request.off_cache_line_size: + arg += ',{}'.format(request.off_cache_line_size) + args.append(arg) + args.append('-iter {}'.format(request.iterations)) + if request.npmin is not None: + args.append(f'-npmin {request.npmin}') + # Setting iter_policy to off to collect the same number of samples every time. 
+ args.append('-iter_policy off') + if not request.include_zero_byte: + args.append('-zero_size off') + # MPI benchmark tests will ignore this option if not present + args.append('-show_tail yes') + if latency_file: + args.append(f'-dumpfile {latency_file}') + if not ppn_set: + # only use -map if the --mpi_ppn was not set + number_hosts = len(request.vms) + processes_per_host = request.total_processes // number_hosts + args.append(f'-map {processes_per_host}x{number_hosts}') + args.extend(request.tests) + return args + + +def _CreateMpiDataForHistogram(grouped_lines: List[List[str]], + results: List[MpiResult]) -> None: + """Adds histogram data from the histogram file to existing data. + + The MPI parsed results are passed in as some benchmarks runs can do many + sub-runs of different MPI group values. This code pairs up those runs done + in order with the latency file that has all the runs concatenated together. + + Args: + grouped_lines: The histogram text file lines grouped by sub-run. + results: The parsed MPI results from the non-histogram data. + """ + acceptable_mpi_data: List[MpiData] = [] + # MPI runs that time out should not have histogram data associated with it. + for result in results: + acceptable_mpi_data.extend( + mpi_data for mpi_data in result.data if not mpi_data.is_error) + histograms: List[MpiData] = [] + for lines in grouped_lines: + histograms.extend(_CombineHistogramEntries(lines)) + if _MpiHistogramAcceptable(acceptable_mpi_data, histograms): + for mpi_data, histogram in zip(acceptable_mpi_data, histograms): + mpi_data.histogram = histogram.histogram + + +def _MpiHistogramAcceptable(mpi_data: List[MpiData], + histograms: List[MpiData]) -> bool: + """Returns whether the parsed MpiResults MpiData matches with the histograms. + + Criteria: + Number of MpiResults.data[] entries are the same. + The number of bytes for each MpiData matches. + The number of repetitions for each MpiData matches. + + Args: + mpi_data: List of MpiData parsed for this run. + histograms: List of MpiData histograms parsed for this run. + """ + if len(mpi_data) != len(histograms): + logging.warning('Have %s parsed MPI data but only %s histograms', + len(mpi_data), len(histograms)) + return False + for mpi_data, histogram in zip(mpi_data, histograms): + bytes_same = mpi_data.bytes == histogram.bytes + repetitions_same = mpi_data.repetitions == histogram.repetitions + if not bytes_same or not repetitions_same: + logging.warning('Parsed MPI data %s does not match with histogram %s', + mpi_data, histogram) + return False + return True + + +def _CombineHistogramEntries(lines: Iterable[str]) -> Iterator[MpiData]: + """Converts the -dumpfile latency file into MpiData. + + The latency file lines are in this form: + integer_bytes latency_usec + For example this is for a run with one latency value of 11.0usec for the + bytes=1024 run and three values for bytes=2048 of 12.1,13.5, and 13.5 usec: + 1024 11 + 2048 12.1 + 2048 13.5 + 2048 13.5 + + The number of MpiDatas returned is equal to the unique number of bytes=### + runs in the input. The MpiData's "histogram" field will be populated with a + dict where the key is the latency in microseconds and the value is the number + of times that latency has been seen. + + Args: + lines: The lines from the latency dump file. + + Yields: + An MpiData that has the histogram of latencies for all runs of a particular + number of bytes. 
+ """ + latencies = collections.defaultdict(list) + for line in lines: + # format of file is "integer_bytes latency_usec" + parts = line.strip().split() + if len(parts) == 2: + latencies[int(parts[0])].append(float(parts[1])) + else: + logging.warning('Latency file line "%s" should have two parts', line) + if not latencies: + logging.warning('No latency entries found') + for number_bytes, times in sorted(latencies.items()): + histogram = collections.Counter() + for item in times: + # Round the sub-microsecond latency based on the latency value to reduce + # the number of latency histogram keys. + # Under 5 usec: 0.01usec accuracy. 5-40 usec: 0.1usec, 40+ usec: 1usec + if item < 5: + item = round(item, 2) + elif item < 40: + item = round(item, 1) + else: + item = round(item, 0) + histogram[item] += 1 + yield MpiData( + bytes=number_bytes, + histogram=dict(histogram), + repetitions=sum(histogram.values())) + + +def _GroupLatencyLines(vm, latency_file: str, + packets_per_run: int) -> List[List[str]]: + r"""Parses the histogram latency file copied from the remote VM. + + The latency file contains multiple sub-runs concatenated together. Each of + those runs is of length packets_per_run. The returned file is chunked into + groups of that size. + + Example: ("1\n2\n3\n4\n5\n6", 2) => [["1","2"],["3","4"],["5","6"]] + + Args: + vm: The virtual machine that has the histogram file. + latency_file: Path to the latency file on the VM. + packets_per_run: The number of packets (lines) for each test run. + + Returns: + List of lists of strings of length packets_per_run or an empty list if there + is a problem dividing up the lines into groups. + """ + local_file: str = os.path.join(temp_dir.GetRunDirPath(), + os.path.basename(latency_file)) + if vm.TryRemoteCommand(f'test -f {latency_file}'): + vm.PullFile(local_file, latency_file) + else: + logging.warning('Skipping gathering latency as %s file missing', + latency_file) + return [] + with open(local_file) as reader: + lines = [line.strip() for line in reader.readlines()] + number_groups = len(lines) // packets_per_run + if packets_per_run * number_groups != len(lines): + logging.warning('File %s has %s lines, cannot be divided into size %s', + local_file, len(lines), packets_per_run) + return [] + return [ + lines[i:i + packets_per_run] + for i in range(0, len(lines), packets_per_run) + ] + + +def ParseMpiEnv(lines: Sequence[str]) -> Dict[str, str]: + """Reads the log file for environment parameters. + + Args: + lines: Text lines from mpirun output. 
+ + Returns: + Dict of the MPI envirnoment variables + """ + mpi_env = {} + for line in lines: + row = _MPI_ENV_RE.search(line) + if not row: + continue + mpi_env[row['mpi_var']] = row['mpi_value'] + return mpi_env diff --git a/tests/data/mpi/mpi_allgather_output.txt b/tests/data/mpi/mpi_allgather_output.txt new file mode 100644 index 0000000000..d096c743ed --- /dev/null +++ b/tests/data/mpi/mpi_allgather_output.txt @@ -0,0 +1,324 @@ +#------------------------------------------------------------ +# Intel(R) MPI Benchmarks 2019 Update 6, MPI-1 part +#------------------------------------------------------------ +# Date : Sat Apr 10 00:55:13 2021 +# Machine : x86_64 +# System : Linux +# Release : 3.10.0-1160.15.2.el7.x86_64 +# Version : #1 SMP Wed Feb 3 15:06:38 UTC 2021 +# MPI Version : 3.1 +# MPI Thread Environment: + + +# Calling sequence was: + +# mpi-benchmarks/IMB-MPI1 -msglog 9:9 -multi 0 -time 60 -off_cache -1 -iter 100000 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-Allgather-1dabaabb.txt Allgather + +# Minimum message length in bytes: 512 +# Maximum message length in bytes: 512 +# +# MPI_Datatype : MPI_BYTE +# MPI_Datatype for reductions : MPI_FLOAT +# MPI_Op : MPI_SUM +# +# + +# List of Benchmarks to run: + +# Allgather + +#---------------------------------------------------------------- +# Benchmarking Multi-Allgather +# ( 60 groups of 2 processes each running simultaneous ) +# Group 0: 0 1 +# +# Group 1: 2 3 +# +# Group 2: 4 5 +# +# Group 3: 6 7 +# +# Group 4: 8 9 +# +# Group 5: 10 11 +# +# Group 6: 12 13 +# +# Group 7: 14 15 +# +# Group 8: 16 17 +# +# Group 9: 18 19 +# +# Group 10: 20 21 +# +# Group 11: 22 23 +# +# Group 12: 24 25 +# +# Group 13: 26 27 +# +# Group 14: 28 29 +# +# Group 15: 30 31 +# +# Group 16: 32 33 +# +# Group 17: 34 35 +# +# Group 18: 36 37 +# +# Group 19: 38 39 +# +# Group 20: 40 41 +# +# Group 21: 42 43 +# +# Group 22: 44 45 +# +# Group 23: 46 47 +# +# Group 24: 48 49 +# +# Group 25: 50 51 +# +# Group 26: 52 53 +# +# Group 27: 54 55 +# +# Group 28: 56 57 +# +# Group 29: 58 59 +# +# Group 30: 60 61 +# +# Group 31: 62 63 +# +# Group 32: 64 65 +# +# Group 33: 66 67 +# +# Group 34: 68 69 +# +# Group 35: 70 71 +# +# Group 36: 72 73 +# +# Group 37: 74 75 +# +# Group 38: 76 77 +# +# Group 39: 78 79 +# +# Group 40: 80 81 +# +# Group 41: 82 83 +# +# Group 42: 84 85 +# +# Group 43: 86 87 +# +# Group 44: 88 89 +# +# Group 45: 90 91 +# +# Group 46: 92 93 +# +# Group 47: 94 95 +# +# Group 48: 96 97 +# +# Group 49: 98 99 +# +# Group 50: 100 101 +# +# Group 51: 102 103 +# +# Group 52: 104 105 +# +# Group 53: 106 107 +# +# Group 54: 108 109 +# +# Group 55: 110 111 +# +# Group 56: 112 113 +# +# Group 57: 114 115 +# +# Group 58: 116 117 +# +# Group 59: 118 119 +# +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 512 100000 30.48 37.96 33.80 +[ 1.91, 21.93, 27.89, 33.14, 40.05, 47.92, 53.17, 67.00, 74.15, 108.00, 1500.00, 1789.09 ] + +#---------------------------------------------------------------- +# Benchmarking Multi-Allgather +# ( 30 groups of 4 processes each running simultaneous ) +# Group 0: 0 1 2 3 +# +# Group 1: 4 5 6 7 +# +# Group 2: 8 9 10 11 +# +# Group 3: 12 13 14 15 +# +# Group 4: 16 17 18 19 +# +# Group 5: 20 21 22 23 +# +# Group 6: 24 25 26 27 +# +# Group 7: 28 29 30 31 +# +# Group 8: 32 33 34 35 +# +# Group 9: 36 37 38 39 +# +# Group 10: 40 41 42 43 +# +# Group 11: 44 45 46 47 +# +# Group 12: 48 49 50 51 +# +# Group 13: 52 53 54 55 +# +# Group 14: 56 57 58 
59 +# +# Group 15: 60 61 62 63 +# +# Group 16: 64 65 66 67 +# +# Group 17: 68 69 70 71 +# +# Group 18: 72 73 74 75 +# +# Group 19: 76 77 78 79 +# +# Group 20: 80 81 82 83 +# +# Group 21: 84 85 86 87 +# +# Group 22: 88 89 90 91 +# +# Group 23: 92 93 94 95 +# +# Group 24: 96 97 98 99 +# +# Group 25: 100 101 102 103 +# +# Group 26: 104 105 106 107 +# +# Group 27: 108 109 110 111 +# +# Group 28: 112 113 114 115 +# +# Group 29: 116 117 118 119 +# +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 512 100000 64.04 92.11 75.38 +[ 9.06, 43.87, 56.03, 72.96, 92.98, 113.96, 128.03, 157.12, 171.18, 253.20, 1500.00, 6736.99 ] + +#---------------------------------------------------------------- +# Benchmarking Multi-Allgather +# ( 15 groups of 8 processes each running simultaneous ) +# Group 0: 0 1 2 3 4 5 6 7 +# +# Group 1: 8 9 10 11 12 13 14 15 +# +# Group 2: 16 17 18 19 20 21 22 23 +# +# Group 3: 24 25 26 27 28 29 30 31 +# +# Group 4: 32 33 34 35 36 37 38 39 +# +# Group 5: 40 41 42 43 44 45 46 47 +# +# Group 6: 48 49 50 51 52 53 54 55 +# +# Group 7: 56 57 58 59 60 61 62 63 +# +# Group 8: 64 65 66 67 68 69 70 71 +# +# Group 9: 72 73 74 75 76 77 78 79 +# +# Group 10: 80 81 82 83 84 85 86 87 +# +# Group 11: 88 89 90 91 92 93 94 95 +# +# Group 12: 96 97 98 99 100 101 102 103 +# +# Group 13: 104 105 106 107 108 109 110 111 +# +# Group 14: 112 113 114 115 116 117 118 119 +# +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 512 100000 69.61 97.04 82.71 +[ 14.07, 48.88, 63.90, 81.06, 101.09, 120.88, 133.99, 166.89, 189.07, 357.87, 1500.00, 7099.87 ] + +#---------------------------------------------------------------- +# Benchmarking Multi-Allgather +# ( 7 groups of 16 processes each running simultaneous ) +# Group 0: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +# +# Group 1: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +# +# Group 2: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +# +# Group 3: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +# +# Group 4: 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 +# +# Group 5: 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 +# +# Group 6: 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 +# +# ( 8 additional processes waiting in MPI_Barrier) +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 512 100000 77.43 93.56 84.53 +[ 15.97, 53.17, 67.00, 82.02, 97.99, 114.92, 128.03, 158.07, 179.05, 507.83, 1500.00, 7114.89 ] + +#---------------------------------------------------------------- +# Benchmarking Multi-Allgather +# ( 3 groups of 32 processes each running simultaneous ) +# Group 0: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +# 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +# +# Group 1: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +# 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +# +# Group 2: 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 +# 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 +# +# ( 24 additional processes waiting in MPI_Barrier) +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 512 100000 84.96 93.30 89.08 +[ 20.98, 56.98, 70.10, 86.07, 103.95, 123.02, 136.14, 174.05, 247.00, 1224.04, 1500.00, 11262.89 ] + +#---------------------------------------------------------------- +# Benchmarking Allgather +# #processes = 64 +# ( 56 additional 
processes waiting in MPI_Barrier) +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 512 100000 89.38 135.46 113.14 +[ 70.81, 116.11, 120.16, 125.89, 133.99, 145.91, 154.97, 183.11, 231.98, 1411.91, 1500.00, 6277.08 ] + +#---------------------------------------------------------------- +# Benchmarking Allgather +# #processes = 120 +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 512 100000 127.62 227.38 177.07 +[ 139.00, 195.03, 205.99, 219.11, 233.89, 252.96, 263.93, 293.97, 313.04, 441.07, 1500.00, 1156.81 ] + + +# All processes entering MPI_Finalize + diff --git a/tests/data/mpi/mpi_allgather_parsed.json b/tests/data/mpi/mpi_allgather_parsed.json new file mode 100644 index 0000000000..084afd7ab0 --- /dev/null +++ b/tests/data/mpi/mpi_allgather_parsed.json @@ -0,0 +1,233 @@ +[ + { + "benchmark": "Multi-Allgather", + "data": [ + { + "bytes": 512, + "repetitions": 100000, + "data": { + "time_min": 30.48, + "time_max": 37.96, + "time_avg": 33.8, + "latency_min": 1.91, + "latency_p10": 21.93, + "latency_p25": 27.89, + "latency_p50": 33.14, + "latency_p75": 40.05, + "latency_p90": 47.92, + "latency_p95": 53.17, + "latency_p99": 67.0, + "latency_p99.5": 74.15, + "latency_p99.9": 108.0, + "latency_p99.99": 1500.00, + "latency_max": 1789.09 + }, + "is_error": false, + "histogram": null + } + ], + "group_layout": {"0": [0, 1], "1": [2, 3], "2": [4, 5], "3": [6, 7], "4": [8, 9], "5": [10, 11], "6": [12, 13], "7": [14, 15], "8": [16, 17], "9": [18, 19], "10": [20, 21], "11": [22, 23], "12": [24, 25], "13": [26, 27], "14": [28, 29], "15": [30, 31], "16": [32, 33], "17": [34, 35], "18": [36, 37], "19": [38, 39], "20": [40, 41], "21": [42, 43], "22": [44, 45], "23": [46, 47], "24": [48, 49], "25": [50, 51], "26": [52, 53], "27": [54, 55], "28": [56, 57], "29": [58, 59], "30": [60, 61], "31": [62, 63], "32": [64, 65], "33": [66, 67], "34": [68, 69], "35": [70, 71], "36": [72, 73], "37": [74, 75], "38": [76, 77], "39": [78, 79], "40": [80, 81], "41": [82, 83], "42": [84, 85], "43": [86, 87], "44": [88, 89], "45": [90, 91], "46": [92, 93], "47": [94, 95], "48": [96, 97], "49": [98, 99], "50": [100, 101], "51": [102, 103], "52": [104, 105], "53": [106, 107], "54": [108, 109], "55": [110, 111], "56": [112, 113], "57": [114, 115], "58": [116, 117], "59": [118, 119]}, + "groups": 60, + "processes_per_group": 2, + "mode": null + }, + { + "benchmark": "Multi-Allgather", + "data": [ + { + "bytes": 512, + "repetitions": 100000, + "data": { + "time_min": 64.04, + "time_max": 92.11, + "time_avg": 75.38, + "latency_min": 9.06, + "latency_p10": 43.87, + "latency_p25": 56.03, + "latency_p50": 72.96, + "latency_p75": 92.98, + "latency_p90": 113.96, + "latency_p95": 128.03, + "latency_p99": 157.12, + "latency_p99.5": 171.18, + "latency_p99.9": 253.2, + "latency_p99.99": 1500.00, + "latency_max": 6736.99 + }, + "is_error": false, + "histogram": null + } + ], + "group_layout": {"0": [0, 1, 2, 3], "1": [4, 5, 6, 7], "10": [40, 41, 42, 43], "11": [44, 45, 46, 47], "12": [48, 49, 50, 51], "13": [52, 53, 54, 55], "14": [56, 57, 58, 59], "15": [60, 61, 62, 63], "16": [64, 65, 66, 67], "17": [68, 69, 70, 71], "18": [72, 73, 74, 75], "19": [76, 77, 78, 79], "2": [8, 9, 10, 11], "20": [80, 81, 82, 83], "21": [84, 85, 86, 87], "22": [88, 89, 90, 91], "23": [92, 93, 94, 95], "24": [96, 97, 98, 99], "25": [100, 101, 102, 103], "26": [104, 105, 106, 107], "27": [108, 109, 110, 
111], "28": [112, 113, 114, 115], "29": [116, 117, 118, 119], "3": [12, 13, 14, 15], "4": [16, 17, 18, 19], "5": [20, 21, 22, 23], "6": [24, 25, 26, 27], "7": [28, 29, 30, 31], "8": [32, 33, 34, 35], "9": [36, 37, 38, 39]}, + "groups": 30, + "processes_per_group": 4, + "mode": null + }, + { + "benchmark": "Multi-Allgather", + "data": [ + { + "bytes": 512, + "repetitions": 100000, + "data": { + "time_min": 69.61, + "time_max": 97.04, + "time_avg": 82.71, + "latency_min": 14.07, + "latency_p10": 48.88, + "latency_p25": 63.9, + "latency_p50": 81.06, + "latency_p75": 101.09, + "latency_p90": 120.88, + "latency_p95": 133.99, + "latency_p99": 166.89, + "latency_p99.5": 189.07, + "latency_p99.9": 357.87, + "latency_p99.99": 1500.00, + "latency_max": 7099.87 + }, + "is_error": false, + "histogram": null + } + ], + "group_layout": {"0": [0, 1, 2, 3, 4, 5, 6, 7], "1": [8, 9, 10, 11, 12, 13, 14, 15], "2": [16, 17, 18, 19, 20, 21, 22, 23], "3": [24, 25, 26, 27, 28, 29, 30, 31], "4": [32, 33, 34, 35, 36, 37, 38, 39], "5": [40, 41, 42, 43, 44, 45, 46, 47], "6": [48, 49, 50, 51, 52, 53, 54, 55], + "7": [56, 57, 58, 59, 60, 61, 62, 63], "8": [64, 65, 66, 67, 68, 69, 70, 71], "9": [72, 73, 74, 75, 76, 77, 78, 79], "10": [80, 81, 82, 83, 84, 85, 86, 87], + "11": [88, 89, 90, 91, 92, 93, 94, 95], "12": [96, 97, 98, 99, 100, 101, 102, 103], "13": [104, 105, 106, 107, 108, 109, 110, 111], "14": [112, 113, 114, 115, 116, 117, 118, 119]}, + "groups": 15, + "processes_per_group": 8, + "mode": null + }, + { + "benchmark": "Multi-Allgather", + "data": [ + { + "bytes": 512, + "repetitions": 100000, + "data": { + "time_min": 77.43, + "time_max": 93.56, + "time_avg": 84.53, + "latency_min": 15.97, + "latency_p10": 53.17, + "latency_p25": 67.0, + "latency_p50": 82.02, + "latency_p75": 97.99, + "latency_p90": 114.92, + "latency_p95": 128.03, + "latency_p99": 158.07, + "latency_p99.5": 179.05, + "latency_p99.9": 507.83, + "latency_p99.99": 1500.00, + "latency_max": 7114.89 + }, + "is_error": false, + "histogram": null + } + ], + "group_layout": {"0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "1": [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], + "2": [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], "3": [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63], + "4": [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79], "5": [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95], + "6" : [96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]}, + "groups": 7, + "processes_per_group": 16, + "mode": null + }, + { + "benchmark": "Multi-Allgather", + "data": [ + { + "bytes": 512, + "repetitions": 100000, + "data": { + "time_min": 84.96, + "time_max": 93.3, + "time_avg": 89.08, + "latency_min": 20.98, + "latency_p10": 56.98, + "latency_p25": 70.1, + "latency_p50": 86.07, + "latency_p75": 103.95, + "latency_p90": 123.02, + "latency_p95": 136.14, + "latency_p99": 174.05, + "latency_p99.5": 247.0, + "latency_p99.9": 1224.04, + "latency_p99.99": 1500.00, + "latency_max": 11262.89 + }, + "is_error": false, + "histogram": null + } + ], + "group_layout": {"0": [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], + "1": [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63], + "2": [80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]}, + "groups": 3, + "processes_per_group": 32, + "mode": null + }, + { + "benchmark": "Allgather", + "data": [ + { + "bytes": 512, + "repetitions": 100000, + "data": { + 
"time_min": 89.38, + "time_max": 135.46, + "time_avg": 113.14, + "latency_min": 70.81, + "latency_p10": 116.11, + "latency_p25": 120.16, + "latency_p50": 125.89, + "latency_p75": 133.99, + "latency_p90": 145.91, + "latency_p95": 154.97, + "latency_p99": 183.11, + "latency_p99.5": 231.98, + "latency_p99.9": 1411.91, + "latency_p99.99": 1500.00, + "latency_max": 6277.08 + }, + "is_error": false, + "histogram": null + } + ], + "group_layout": null, + "groups": null, + "processes_per_group": 64, + "mode": null + }, + { + "benchmark": "Allgather", + "data": [ + { + "bytes": 512, + "repetitions": 100000, + "data": { + "time_min": 127.62, + "time_max": 227.38, + "time_avg": 177.07, + "latency_min": 139.0, + "latency_p10": 195.03, + "latency_p25": 205.99, + "latency_p50": 219.11, + "latency_p75": 233.89, + "latency_p90": 252.96, + "latency_p95": 263.93, + "latency_p99": 293.97, + "latency_p99.5": 313.04, + "latency_p99.9": 441.07, + "latency_p99.99": 1500.00, + "latency_max": 1156.81 + }, + "is_error": false, + "histogram": null + } + ], + "group_layout": null, + "groups": null, + "processes_per_group": 120, + "mode": null + } +] diff --git a/tests/data/mpi/mpi_barrier_output.txt b/tests/data/mpi/mpi_barrier_output.txt new file mode 100644 index 0000000000..674c50f5d2 --- /dev/null +++ b/tests/data/mpi/mpi_barrier_output.txt @@ -0,0 +1,51 @@ +#------------------------------------------------------------ +# Intel(R) MPI Benchmarks 2019 Update 6, MPI-1 part +#------------------------------------------------------------ +# Date : Sun Aug 30 21:47:17 2020 +# Machine : x86_64 +# System : Linux +# Release : 4.15.0-1080-gcp +# Version : #90~16.04.1-Ubuntu SMP Fri Jul 10 19:11:10 UTC 2020 +# MPI Version : 3.1 +# MPI Thread Environment: + + +# Calling sequence was: + +# mpi-benchmarks/IMB-MPI1 -msglog 10:11 -multi 0 -time 60 -off_cache -1 -iter 100000 -iter_policy off -zero_size off barrier + +# Minimum message length in bytes: 1024 +# Maximum message length in bytes: 2048 +# +# MPI_Datatype : MPI_BYTE +# MPI_Datatype for reductions : MPI_FLOAT +# MPI_Op : MPI_SUM +# +# + +# List of Benchmarks to run: + +# Barrier + +#--------------------------------------------------- +# Benchmarking Multi-Barrier +# ( 2 groups of 2 processes each running simultaneous ) +# Group 0: 0 1 +# +# Group 1: 2 3 +# +#--------------------------------------------------- + #repetitions t_min[usec] t_max[usec] t_avg[usec] + 100000 80.75 81.05 80.90 + +#--------------------------------------------------- +# Benchmarking Barrier +# #processes = 4 +#--------------------------------------------------- + #repetitions t_min[usec] t_max[usec] t_avg[usec] + 100000 91.24 91.24 91.24 + + +# All processes entering MPI_Finalize + + diff --git a/tests/data/mpi/mpi_barrier_parsed.json b/tests/data/mpi/mpi_barrier_parsed.json new file mode 100644 index 0000000000..d94b0ff6c5 --- /dev/null +++ b/tests/data/mpi/mpi_barrier_parsed.json @@ -0,0 +1,40 @@ +[ + { + "data":[ + { + "data":{ + "time_min":80.75, + "time_max":81.05, + "time_avg":80.9 + }, + "is_error":false, + "repetitions":100000, + "bytes":0 + } + ], + "processes_per_group":2, + "benchmark":"Multi-Barrier", + "mode":null, + "group_layout": {"0": [0, 1], "1": [2, 3]}, + "groups":2 + }, + { + "data":[ + { + "data":{ + "time_min":91.24, + "time_max":91.24, + "time_avg":91.24 + }, + "is_error":false, + "repetitions":100000, + "bytes":0 + } + ], + "processes_per_group":4, + "benchmark":"Barrier", + "mode":null, + "group_layout":null, + "groups":null + } +] diff --git 
a/tests/data/mpi/mpi_debug_output.txt b/tests/data/mpi/mpi_debug_output.txt new file mode 100644 index 0000000000..af214f84e1 --- /dev/null +++ b/tests/data/mpi/mpi_debug_output.txt @@ -0,0 +1,11 @@ +[0] MPI startup(): libfabric version: 1.9.0a1-impi +[0] MPI startup(): libfabric provider: tcp;ofi_rxm +[0] MPI startup(): Rank Pid Node name Pin cpu +[0] MPI startup(): 0 18790 pkb-f92e1167-0 {0,1,2,3} +[0] MPI startup(): 1 18944 pkb-f92e1167-1 {0,1,2,3} +[0] MPI startup(): I_MPI_ROOT=/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi +[0] MPI startup(): I_MPI_MPIRUN=mpirun +[0] MPI startup(): I_MPI_HYDRA_TOPOLIB=hwloc +[0] MPI startup(): I_MPI_INTERNAL_MEM_POLICY=default +[0] MPI startup(): I_MPI_DEBUG=5 +#------------------------------------------------------------ diff --git a/tests/data/mpi/mpi_latencies_output.txt b/tests/data/mpi/mpi_latencies_output.txt new file mode 100644 index 0000000000..61b53d0ea8 --- /dev/null +++ b/tests/data/mpi/mpi_latencies_output.txt @@ -0,0 +1,42 @@ +#------------------------------------------------------------ +# Intel(R) MPI Benchmarks 2019 Update 6, MPI-1 part +#------------------------------------------------------------ +# Date : Mon Jul 6 17:04:56 2020 +# Machine : x86_64 +# System : Linux +# Release : 5.3.0-1026-gcp +# Version : #28~18.04.1-Ubuntu SMP Sat Jun 6 00:09:26 UTC 2020 +# MPI Version : 3.1 +# MPI Thread Environment: + + +# Calling sequence was: + +# IMB-MPI1 pingpong -msglog 10:10 -multi 0 -show_tail yes -dumpfile /tmp/dump.txt -iter 1000000 -iter_policy off + +# Minimum message length in bytes: 0 +# Maximum message length in bytes: 1024 +# +# MPI_Datatype : MPI_BYTE +# MPI_Datatype for reductions : MPI_FLOAT +# MPI_Op : MPI_SUM +# +# + +# List of Benchmarks to run: + +# PingPong + +#----------------------------------------------------------------------------- +# Benchmarking PingPong +# #processes = 2 +#----------------------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] Mbytes/sec + 0 1000000 1.17 1.17 1.17 0.00 +[ 0.83, 0.97, 0.98, 1.00, 1.02, 1.72, 1.75, 2.28, 3.12, 6.73, 50.00, 65.19 ] + 1024 1000000 1.80 1.80 1.80 569.96 +[ 1.16, 1.27, 1.29, 1.81, 2.06, 2.17, 2.40, 3.46, 4.34, 10.27, 50.00, 215.10 ] + + +# All processes entering MPI_Finalize + diff --git a/tests/data/mpi/mpi_latencies_parsed.json b/tests/data/mpi/mpi_latencies_parsed.json new file mode 100644 index 0000000000..7bf705ea50 --- /dev/null +++ b/tests/data/mpi/mpi_latencies_parsed.json @@ -0,0 +1,56 @@ +[ + { + "data":[ + { + "data":{ + "latency_min":0.83, + "time_avg":1.17, + "latency_p99.5":3.12, + "latency_p99.99":50.00, + "time_max":1.17, + "latency_p99":2.28, + "latency_p10":0.97, + "latency_p75":1.02, + "latency_p95":1.75, + "latency_p50":1.0, + "latency_p99.9":6.73, + "latency_p90":1.72, + "throughput":0.0, + "latency_max":65.19, + "time_min":1.17, + "latency_p25":0.98 + }, + "is_error":false, + "repetitions":1000000, + "bytes":0 + }, + { + "data":{ + "latency_min":1.16, + "time_avg":1.8, + "latency_p99.5":4.34, + "latency_p99.99":50.00, + "time_max":1.8, + "latency_p99":3.46, + "latency_p10":1.27, + "latency_p75":2.06, + "latency_p95":2.4, + "latency_p50":1.81, + "latency_p99.9":10.27, + "latency_p90":2.17, + "throughput":569.96, + "latency_max":215.1, + "time_min":1.8, + "latency_p25":1.29 + }, + "is_error":false, + "repetitions":1000000, + "bytes":1024 + } + ], + "processes_per_group":2, + "benchmark":"PingPong", + "mode":null, + "groups":null + } +] diff --git 
a/tests/data/mpi/mpi_one_put_all_output.txt b/tests/data/mpi/mpi_one_put_all_output.txt new file mode 100644 index 0000000000..d5e8d4fb7c --- /dev/null +++ b/tests/data/mpi/mpi_one_put_all_output.txt @@ -0,0 +1,28 @@ +# mpi-benchmarks/IMB-RMA -msglog 0:0 -multi 0 -time 60 -off_cache -1 -iter 10 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-One_put_all-839321dc.txt One_put_all + +# Minimum message length in bytes: 0 +# Maximum message length in bytes: 1 +# +# MPI_Datatype : MPI_BYTE +# MPI_Datatype for reductions : MPI_FLOAT +# MPI_Op : MPI_SUM +# +# + +# List of Benchmarks to run: + +# One_put_all +Invalid benchmark name -zero_size +Invalid benchmark name off +Invalid benchmark name -show_tail +Invalid benchmark name yes +Invalid benchmark name -dumpfile +Invalid benchmark name /tmp/latency-one_put_all-839321dc.txt + +#--------------------------------------------------- +# Benchmarking One_put_all +# #processes = 2 +#--------------------------------------------------- + #bytes #repetitions t[usec] Mbytes/sec + 0 10 0.10 0.00 + 1 10 10.01 0.10 diff --git a/tests/data/mpi/mpi_one_put_all_parsed.json b/tests/data/mpi/mpi_one_put_all_parsed.json new file mode 100644 index 0000000000..c67006c5c1 --- /dev/null +++ b/tests/data/mpi/mpi_one_put_all_parsed.json @@ -0,0 +1,31 @@ +[ + { + "benchmark": "One_put_all", + "data": [ + { + "bytes": 0, + "repetitions": 10, + "data": { + "time_avg": 0.1, + "throughput": 0.0 + }, + "is_error": false, + "histogram": null + }, + { + "bytes": 1, + "repetitions": 10, + "data": { + "time_avg": 10.01, + "throughput": 0.1 + }, + "is_error": false, + "histogram": null + } + ], + "groups": null, + "processes_per_group": 2, + "mode": null, + "group_layout": null + } +] diff --git a/tests/data/mpi/mpi_pingpong_output.txt b/tests/data/mpi/mpi_pingpong_output.txt new file mode 100644 index 0000000000..6f5d400e95 --- /dev/null +++ b/tests/data/mpi/mpi_pingpong_output.txt @@ -0,0 +1,44 @@ +#------------------------------------------------------------ +# Intel(R) MPI Benchmarks 2019 Update 6, MPI-1 part +#------------------------------------------------------------ +# Date : Sun Aug 30 18:36:31 2020 +# Machine : x86_64 +# System : Linux +# Release : 4.15.0-1080-gcp +# Version : #90~16.04.1-Ubuntu SMP Fri Jul 10 19:11:10 UTC 2020 +# MPI Version : 3.1 +# MPI Thread Environment: + + +# Calling sequence was: + +# mpi-benchmarks/IMB-MPI1 -msglog 10:11 -multi 0 -time 60 -off_cache -1 -iter 100000 -iter_policy off -zero_size off pingpong + +# Minimum message length in bytes: 1024 +# Maximum message length in bytes: 2048 +# +# MPI_Datatype : MPI_BYTE +# MPI_Datatype for reductions : MPI_FLOAT +# MPI_Op : MPI_SUM +# +# + +# List of Benchmarks to run: + +# PingPong + +#----------------------------------------------------------------------------- +# Benchmarking Multi-PingPong +# ( 2 groups of 2 processes each running simultaneous ) +# Group 0: 0 1 +# +# Group 1: 2 3 +# +#----------------------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] Mbytes/sec + 1024 100000 56.92 61.39 59.15 16.68 + 2048 100000 64.01 68.79 66.40 29.77 + + +# All processes entering MPI_Finalize + diff --git a/tests/data/mpi/mpi_pingpong_parsed.json b/tests/data/mpi/mpi_pingpong_parsed.json new file mode 100644 index 0000000000..5441343c15 --- /dev/null +++ b/tests/data/mpi/mpi_pingpong_parsed.json @@ -0,0 +1,33 @@ +[ + { + "processes_per_group":2, + "benchmark":"Multi-PingPong", + "mode":null, + "group_layout": {"0": [0, 
1], "1": [2, 3]}, + "groups":2, + "data":[ + { + "data":{ + "throughput":16.68, + "time_min":56.92, + "time_max":61.39, + "time_avg":59.15 + }, + "is_error":false, + "repetitions":100000, + "bytes":1024 + }, + { + "data":{ + "throughput":29.77, + "time_min":64.01, + "time_max":68.79, + "time_avg":66.4 + }, + "is_error":false, + "repetitions":100000, + "bytes":2048 + } + ] + } +] diff --git a/tests/data/mpi/mpi_reduce_output.txt b/tests/data/mpi/mpi_reduce_output.txt new file mode 100644 index 0000000000..18b5c490ca --- /dev/null +++ b/tests/data/mpi/mpi_reduce_output.txt @@ -0,0 +1,41 @@ +#------------------------------------------------------------ +# Intel(R) MPI Benchmarks 2019 Update 6, MPI-1 part +#------------------------------------------------------------ +# Date : Sun Aug 30 22:14:36 2020 +# Machine : x86_64 +# System : Linux +# Release : 4.15.0-1080-gcp +# Version : #90~16.04.1-Ubuntu SMP Fri Jul 10 19:11:10 UTC 2020 +# MPI Version : 3.1 +# MPI Thread Environment: + + +# Calling sequence was: + +# mpi-benchmarks/IMB-MPI1 -msglog 10:11 -multi 0 -time 60 -off_cache -1 -iter 100000 -iter_policy off -zero_size off reduce + +# Minimum message length in bytes: 1024 +# Maximum message length in bytes: 2048 +# +# MPI_Datatype : MPI_BYTE +# MPI_Datatype for reductions : MPI_FLOAT +# MPI_Op : MPI_SUM +# +# + +# List of Benchmarks to run: + +# Reduce + +#---------------------------------------------------------------- +# Benchmarking Reduce +# #processes = 2 +#---------------------------------------------------------------- + #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + 1024 100000 5.24 15.30 10.27 + 2048 100000 7.70 20.51 14.11 + + +# All processes entering MPI_Finalize + + diff --git a/tests/data/mpi/mpi_reduce_parsed.json b/tests/data/mpi/mpi_reduce_parsed.json new file mode 100644 index 0000000000..b220bb83bd --- /dev/null +++ b/tests/data/mpi/mpi_reduce_parsed.json @@ -0,0 +1,30 @@ +[ + { + "data":[ + { + "data":{ + "time_min":5.24, + "time_max":15.3, + "time_avg":10.27 + }, + "is_error":false, + "repetitions":100000, + "bytes":1024 + }, + { + "data":{ + "time_min":7.7, + "time_max":20.51, + "time_avg":14.11 + }, + "is_error":false, + "repetitions":100000, + "bytes":2048 + } + ], + "processes_per_group":2, + "benchmark":"Reduce", + "mode":null, + "groups":null + } +] diff --git a/tests/data/mpi/mpi_tests_samples.json b/tests/data/mpi/mpi_tests_samples.json new file mode 100644 index 0000000000..ecaf6ce53f --- /dev/null +++ b/tests/data/mpi/mpi_tests_samples.json @@ -0,0 +1,306 @@ +[ + { + "metadata": { + "bytes": 1024, + "compile_from_source": true, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 10:10 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_1.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". 
/opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "throughput": 16.68, + "time_avg": 59.15, + "time_max": 61.39, + "time_min": 56.92, + "tune": true + }, + "metric": "time_avg", + "timestamp": 1600999678.0782478, + "unit": "usec", + "value": 59.15 + }, + { + "metadata": { + "bytes": 1024, + "compile_from_source": true, + "histogram": { + "10.0": 50000, + "12.5": 50000 + }, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 10:10 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_1.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "tune": true + }, + "metric": "MPI_Latency_Histogram", + "timestamp": 1600999678.078312, + "unit": "usec", + "value": 0.0 + }, + { + "metadata": { + "bytes": 2048, + "compile_from_source": true, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 10:10 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_1.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". 
/opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "throughput": 29.77, + "time_avg": 66.4, + "time_max": 68.79, + "time_min": 64.01, + "tune": true + }, + "metric": "time_avg", + "timestamp": 1600999678.0782793, + "unit": "usec", + "value": 66.4 + }, + { + "metadata": { + "bytes": 2048, + "compile_from_source": true, + "histogram": { + "50.0": 50000, + "6.0": 50000 + }, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 10:10 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_1.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "tune": true + }, + "metric": "MPI_Latency_Histogram", + "timestamp": 1601160331.9777563, + "unit": "usec", + "value": 0.0 + }, + { + "metadata": { + "bytes": 1024, + "compile_from_source": true, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 11:11 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_2.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". 
/opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "throughput": 16.68, + "time_avg": 59.15, + "time_max": 61.39, + "time_min": 56.92, + "tune": true + }, + "metric": "time_avg", + "timestamp": 1600999678.0792882, + "unit": "usec", + "value": 59.15 + }, + { + "metadata": { + "bytes": 1024, + "compile_from_source": true, + "histogram": { + "10.0": 50000, + "12.5": 50000 + }, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 11:11 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_2.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "tune": true + }, + "metric": "MPI_Latency_Histogram", + "timestamp": 1600999678.078312, + "unit": "usec", + "value": 0.0 + }, + { + "metadata": { + "bytes": 2048, + "compile_from_source": true, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 11:11 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_2.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". 
/opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "throughput": 29.77, + "time_avg": 66.4, + "time_max": 68.79, + "time_min": 64.01, + "tune": true + }, + "metric": "time_avg", + "timestamp": 1600999678.0830917, + "unit": "usec", + "value": 66.4 + }, + { + "metadata": { + "bytes": 2048, + "compile_from_source": true, + "histogram": { + "50.0": 50000, + "6.0": 50000 + }, + "mpi_args": "mpi-benchmarks/IMB-MPI1 -msglog 11:11 -multi 0 -time 60 -off_cache -1 -iter 100000 -npmin 2 -iter_policy off -zero_size off -show_tail yes -dumpfile /tmp/latency-PingPong-uuid_2.txt -map 16x1 PingPong", + "mpi_benchmark": "Multi-PingPong", + "mpi_env": "FI_LOG_LEVEL=info,FI_PROVIDER=tcp,I_MPI_PIN=1,I_MPI_PIN_PROCESSOR_LIST=0", + "mpi_env_FI_LOG_LEVEL": "info", + "mpi_env_FI_PROVIDER": "tcp", + "mpi_env_I_MPI_PIN": "1", + "mpi_env_I_MPI_PIN_PROCESSOR_LIST": "0", + "mpi_groups": 2, + "mpi_layout": "0=0,1;1=2,3", + "mpi_processes_per_group": 2, + "mpi_ranks": 4, + "mpi_run": ". /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh; FI_LOG_LEVEL=info FI_PROVIDER=tcp mpirun -tune -genv I_MPI_PIN=1 -genv I_MPI_PIN_PROCESSOR_LIST=0 -n 16 -hosts 10.0.0.2", + "mpi_suite": "IMB-MPI1", + "mpi_vendor": "intel", + "mpi_version": "2019.2-057", + "nodes": "10.0.0.2", + "number_nodes": 1, + "ppn": 0, + "processes_per_host": 16, + "repetitions": 100000, + "smt_enabled": true, + "threads": 16, + "threads_half_cpus": true, + "tune": true + }, + "metric": "MPI_Latency_Histogram", + "timestamp": 1600999678.083103, + "unit": "usec", + "value": 0.0 + } +] diff --git a/tests/linux_benchmarks/mpi_benchmark_test.py b/tests/linux_benchmarks/mpi_benchmark_test.py new file mode 100644 index 0000000000..0c269129b0 --- /dev/null +++ b/tests/linux_benchmarks/mpi_benchmark_test.py @@ -0,0 +1,264 @@ +"""Tests for MPI benchmark.""" + +from typing import List +import unittest +from unittest import mock +import uuid + +from absl import flags +from absl.testing import flagsaver +from absl.testing import parameterized +from perfkitbenchmarker import benchmark_spec +from perfkitbenchmarker import errors +from perfkitbenchmarker import sample +from perfkitbenchmarker import test_util +from perfkitbenchmarker.linux_benchmarks import mpi_benchmark +from perfkitbenchmarker.linux_packages import intelmpi +from perfkitbenchmarker.linux_packages import mpi +from tests import pkb_common_test_case +from tests.linux_packages import mpi_test + + +FLAGS = flags.FLAGS + +# Histogram results from reading MPI output file +histogram1 = {'12.5': 50000, '10.0': 50000} +histogram2 = {'6.0': 50000, '50.0': 50000} +histogram_text = """\ +1024 12.51 +1024 10.01 +""" * 50000 + """\ +2048 6.00 +2048 50.0 +""" * 50000 + +MPI_VARS = '/opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh' + + +# All VMs have num_cpus=32 +class Vm(pkb_common_test_case.TestLinuxVirtualMachine): + + def __init__(self, + smt_enabled=True, + ip='10.0.0.2', + robust_remote_command_text=None) -> None: + super(Vm, self).__init__(vm_spec=pkb_common_test_case.CreateTestVmSpec()) + self.internal_ip = ip + self._num_cpus = 32 + # pylint: disable=invalid-name + self.IsSmtEnabled = 
mock.PropertyMock(return_value=smt_enabled) + self.RemoteCommand = mock.PropertyMock( + return_value=('Version 2019 Update 2 Build 2019.2-057', '')) + self.RobustRemoteCommand = mock.PropertyMock( + return_value=((mpi_test.ReadMpiOutput('mpi_pingpong_output.txt'), ''))) + + +def MpiRun(vms) -> List[sample.Sample]: + benchmark_module = mock.Mock(BENCHMARK_NAME='mpi') + benchmark_config = mock.Mock( + vm_groups={}, relational_db=mock.Mock(vm_groups={})) + spec = benchmark_spec.BenchmarkSpec(benchmark_module, benchmark_config, + 'abcdefg') + spec.vms = vms + return mpi_benchmark.Run(spec) + + +class MpiBenchmarkTestCase(pkb_common_test_case.PkbCommonTestCase, + test_util.SamplesTestMixin): + + _METRIC_ERR = 'Metric values should be equal %s != %s' + _VALUE_ERR = 'Values should be equal %s != %s' + _UNIT_ERR = 'Unit values should be equal %s != %s' + # the latency dump file name uses uuid4() + _MOCK_UUIDS = [mock.PropertyMock(hex=f'uuid_{i}') for i in range(12)] + + def setUp(self) -> None: + super(MpiBenchmarkTestCase, self).setUp() + FLAGS.mpi_benchmarks = ['PingPong'] + FLAGS.intelmpi_version = '2019.2-057' + self.mock_histo = self.enter_context( + mock.patch.object(mpi, '_GroupLatencyLines')) + self.mock_histo.return_value = [histogram_text.splitlines()] + self.enter_context( + mock.patch.object(intelmpi, 'MpiVars', return_value=MPI_VARS)) + + @mock.patch.object(uuid, 'uuid4', side_effect=_MOCK_UUIDS) + def testRun(self, mock_uuid) -> None: + FLAGS.mpi_threads = [0] + FLAGS.mpi_env = ['FI_PROVIDER=tcp', 'FI_LOG_LEVEL=info'] + FLAGS.mpi_genv = ['I_MPI_PIN_PROCESSOR_LIST=0', 'I_MPI_PIN=1'] + FLAGS.mpi_npmin = 2 + FLAGS.mpi_tune = True + FLAGS.mpi_multi = True + found = MpiRun([Vm()]) + expected = [] + for row in mpi_test.ReadJson('mpi_tests_samples.json'): + expected.append(sample.Sample(**row)) + expected[-1].metadata['installed_mkl'] = False + self.assertSampleListsEqualUpToTimestamp(expected, found) + self.assertLen(expected, 8) + self.assertEqual(2, self.mock_histo.call_count) + + @parameterized.parameters( + { + 'threads': [0], + 'num_vms': 1, + 'expected_threads': [16] + }, + { + 'threads': [2, 6, 18], + 'num_vms': 2, + 'expected_threads': [4, 12, 36] + }, + { + 'threads': [0], + 'num_vms': 1, + 'expected_threads': [32], + 'smt_enabled': False, # this forces threads=num_cpus + }, + ) + @mock.patch.object(mpi_benchmark, '_RunTest') + def testRunTestCommand(self, + mock_run: mock.Mock, + num_vms: int, + expected_threads: List[int], + threads: List[int], + smt_enabled: bool = True) -> None: + FLAGS.mpi_threads = threads + MpiRun([Vm(smt_enabled) for _ in range(num_vms)]) + for total_processes, found in zip(expected_threads, + mock_run.call_args_list): + _, found_total_processes, found_ppn, _ = found[0] + self.assertEqual(total_processes, found_total_processes) + self.assertEqual(0, found_ppn) + self.assertLen( + mock_run.call_args_list, len(expected_threads), + 'Missing / extra calls in {}'.format(mock_run.call_args_list)) + self.mock_histo.assert_not_called() + + @mock.patch.object(mpi, 'RunMpiStats') + def testRunMpiStatsCall(self, mock_mpistats: mock.Mock) -> None: + tests = ['PingPong', 'AllGather'] + FLAGS.mpi_benchmarks = tests + vms = [Vm(ip='1.2.3.4'), Vm(ip='5.6.7.8')] + total_processes = 32 + ppn = 0 + mpi.RunMpiStats.return_value = mpi.MpiResponse('', '', '', '', [], [], {}) + mpi_benchmark._RunTest(vms, total_processes, ppn, False) + # RunMpiStats called for each one of the --mpi_benchmarks and also for each + # of the msglog values: len(['PingPong','AllGather']) * 
len([10,11]) = 4 + self.assertLen(mock_mpistats.call_args_list, 4) + # just test the last one run which is AllGather with msglog_min=11 + mock_mpistats.assert_called_with( + vms[0], + mpi.MpiRequest( + vms=vms, + total_processes=total_processes, + suite='IMB-MPI1', + tests=[tests[-1]], + ppn=ppn, + msglog_min=11, + msglog_max=11, + timeout=60, + off_cache_size=-1, + off_cache_line_size=None, + iterations=100000, + include_zero_byte=False, + compile_from_source=True, + record_latencies=True, + environment=['I_MPI_DEBUG=6'], + multi=True)) + self.mock_histo.assert_not_called() + + @parameterized.parameters((True, 16), (False, 32)) + def testSmtUsage(self, smt_enabled: bool, num_processes: int) -> None: + FLAGS.mpi_threads = [0] + data = MpiRun([Vm(smt_enabled)]) + self.assertNotEmpty(data) + found = data[0].metadata + self.assertEqual(num_processes, found['processes_per_host']) + self.assertEqual(2, self.mock_histo.call_count) + + def testHistoResults(self) -> None: + FLAGS.mpi_record_latency = True + # Returns with this histogram MpiData with every call to the method + data = MpiRun([Vm(False)]) + self.assertLen(data, 16) + histogram_data = [ + item for item in data if item.metric == 'MPI_Latency_Histogram' + ] + self.assertLen(histogram_data, 8) + meta1 = { + 'bytes': 1024, + 'mpi_groups': 2, + 'mpi_processes_per_group': 2, + 'histogram': histogram1 + } + self.assertDictContainsSubset(meta1, histogram_data[0].metadata) + meta2 = { + 'bytes': 2048, + 'mpi_groups': 2, + 'mpi_processes_per_group': 2, + 'histogram': histogram2 + } + self.assertDictContainsSubset(meta2, histogram_data[1].metadata) + self.assertEqual(4, self.mock_histo.call_count) + + @flagsaver.flagsaver(mpi_benchmarks=['Qubert', 'Broadcast', 'allTOaLL']) + def testGetConfigBadBenchmark(self): + # Alltoall is a valid benchmark + with self.assertRaisesRegex(errors.Setup.InvalidFlagConfigurationError, + '"broadcast,qubert"'): + mpi_benchmark.GetConfig({}) + + @flagsaver.flagsaver(mpi_benchmarks=['Bcast'], mpi_msglog_sizes=[20]) + def testGetConfigNoErrors(self): + # Confirms that no exception is thrown + mpi_benchmark.GetConfig({}) + + @flagsaver.flagsaver(mpi_msglog_sizes=[20]) + def testGetConfigBadMessageSizeFlags(self): + # Need to do .parse() so that FLAGS['mpi_msglog_min'].present resolves + FLAGS['mpi_msglog_min'].parse(10) + with self.assertRaises(errors.Setup.InvalidFlagConfigurationError): + mpi_benchmark.GetConfig({}) + + @flagsaver.flagsaver(mpi_suites=['IMB-MT']) + def testRunTestWithSuites(self): + FLAGS.mpi_benchmarks = [] + # Mock response with no results as not testing that functionality + response = mpi.MpiResponse('a', 'b', 'c', 'd', [], [], {}) + mpirun_mock = self.enter_context( + mock.patch.object(mpi, 'RunMpiStats', return_value=response)) + vm = Vm() + + mpi_benchmark._RunTest([vm], 2, 1, True) + + expected_request = mpi.MpiRequest( + vms=[vm], + total_processes=2, + suite='IMB-MT', + tests=['UniBandMT'], + ppn=1, + msglog_min=11, + msglog_max=11, + timeout=60, + off_cache_size=-1, + off_cache_line_size=None, + iterations=100000, + include_zero_byte=False, + compile_from_source=True, + environment=['I_MPI_DEBUG=6'], + global_environment=[], + record_latencies=True, + npmin=None, + tune=False, + multi=True) + # Test the last one called + mpirun_mock.assert_called_with(vm, expected_request) + # It was called len(IMB-MT suite tests) times + self.assertLen(mpirun_mock.call_args_list, 20) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/linux_packages/imb_test.py 
b/tests/linux_packages/imb_test.py new file mode 100644 index 0000000000..bba2e38021 --- /dev/null +++ b/tests/linux_packages/imb_test.py @@ -0,0 +1,182 @@ +"""Tests for Intel MPI benchmark.""" + +import unittest +from unittest import mock + +from absl.testing import flagsaver +from absl.testing import parameterized +from perfkitbenchmarker import os_types +from perfkitbenchmarker.linux_packages import imb +from perfkitbenchmarker.linux_packages import intelmpi +# Required for --mpi_vendor flag. +from perfkitbenchmarker.linux_packages import mpi # pylint: disable=unused-import + +from tests import pkb_common_test_case + + +def MockVm(): + return mock.Mock( + internal_ip='1.2.3.4', NumCpusForBenchmark=8, BASE_OS_TYPE=os_types.RHEL) + + +class IntelMpiLibTestCase(pkb_common_test_case.PkbCommonTestCase): + + MPIVARS_FILE = ('/opt/intel/compilers_and_libraries/' + 'linux/mpi/intel64/bin/mpivars.sh') + + COMPILE_2019 = ('cd mpi-benchmarks; ' + '. /opt/intel/mkl/bin/mklvars.sh intel64; ' + '. /opt/intel/compilers_and_libraries/' + 'linux/bin/compilervars.sh intel64; ' + 'CC=mpicc CXX=mpicxx make') + COMPILE_2021 = ('cd mpi-benchmarks; ' + '. /opt/intel/oneapi/setvars.sh; ' + 'CC=mpicc CXX=mpicxx make') + + def setUp(self): + super().setUp() + self.enter_context(flagsaver.flagsaver(mpi_vendor='intel')) + + def MockVmWithReturnValues(self): + # for use when calling intelmpi.py commands to find mpivars, MPI version + vm = MockVm() + vm_returns = [ + self.MPIVARS_FILE, + ('Intel(R) MPI Library for Linux* OS, ' + 'Version 2018 Update 4 Build 20180823 (id: 18555)') + ] + vm.RemoteCommand.side_effect = [(txt, '') for txt in vm_returns] + return vm + + def testInstallCompileSource(self) -> None: + vm = MockVm() + imb.Install(vm) + # TODO(user) taken out due to not installing MKL + # vm.InstallPackages.assert_called_with('intel-mkl-2020.1-102') + # just confirm that the git clone and patch were done + cmd = ';'.join([cmd[0][0] for cmd in vm.RemoteCommand.call_args_list]) + self.assertRegex( + cmd, 'git clone -n https://github.com/intel/mpi-benchmarks.git', + 'Missing git clone command') + self.assertRegex(cmd, 'patch -d mpi-benchmarks -p3 < ~/intelmpi.patch', + 'Missing patch command') + + def testMpirunMpiVersion(self): + vm = self.MockVmWithReturnValues() + + mpi_version = intelmpi.MpirunMpiVersion(vm) + + self.assertEqual('2018.4', mpi_version) + vm.RemoteCommand.assert_called_with(f'. {self.MPIVARS_FILE}; mpirun -V') + + def testMpirunMpiVersionError(self): + vm = MockVm() + vm.RemoteCommand.return_value = 'Non parsable text', '' + + with self.assertRaises(ValueError): + intelmpi.MpirunMpiVersion(vm) + + @parameterized.parameters((2, ' -ppn 1'), (4, '')) + def testPpn(self, total_processes, expected_suffix): + vm = self.MockVmWithReturnValues() + hosts = ['10.0.0.1', '10.0.0.2'] + + mpirun = imb.MpiRunCommand(vm, hosts, total_processes, 0, [], [], False) + + # '-ppn 1' is only seen when running single threaded tests + expected_mpirun = (f'mpirun -n {total_processes} -hosts 10.0.0.1,10.0.0.2' + f'{expected_suffix}') + self.assertEqual(f'. 
{self.MPIVARS_FILE}; {expected_mpirun}', mpirun) + + @parameterized.parameters( + ('2019.6', COMPILE_2019, []), + ('2021.2', COMPILE_2021, + ['intel-oneapi-compiler-dpcpp-cpp', 'intel-oneapi-mpi-devel'])) + def testInstall2021(self, intelmpi_version, expected_compile_cmd, + installed_packages): + vm = MockVm() + with flagsaver.flagsaver(intelmpi_version=intelmpi_version): + imb.Install(vm) + vm.RemoteCommand.assert_any_call(expected_compile_cmd) + vm.InstallPackages.assert_has_calls( + [mock.call(pkb) for pkb in installed_packages]) + + +class OpenMpiLibTestCase(pkb_common_test_case.PkbCommonTestCase): + + def setUp(self): + super().setUp() + self.enter_context(flagsaver.flagsaver(mpi_vendor='openmpi')) + + def testInstallCompileSource(self) -> None: + vm = MockVm() + imb.Install(vm) + cmd = ';'.join([cmd[0][0] for cmd in vm.RemoteCommand.call_args_list]) + self.assertRegex( + cmd, 'git clone -n https://github.com/intel/mpi-benchmarks.git', + 'Missing git clone command') + self.assertRegex(cmd, 'patch -d mpi-benchmarks -p3 < ~/intelmpi.patch', + 'Missing patch command') + + @flagsaver.flagsaver(imb_compile_from_source=False) + def testInstallWithoutImbCompileFromSourceThrows(self) -> None: + vm = MockVm() + with self.assertRaises(ValueError) as e: + imb.Install(vm) + self.assertEqual( + str(e.exception), + '--mpi_vendor=openmpi requires --imb_compile_from_source') + + def testMpiRunCommandEnvVarsExported(self): + vm = MockVm() + total_proc = 2 + ppn = 1 + hosts = ['10.0.0.1', '10.0.0.2'] + environment = [ + 'OMPI_MCA_btl=self,tcp', + 'OMPI_MCA_rmaps_base_mapping_policy=node:PE=1', + ] + + mpirun = imb.MpiRunCommand(vm, hosts, total_proc, ppn, environment, [], + False) + + expected_mpirun = ( + 'OMPI_MCA_btl=self,tcp OMPI_MCA_rmaps_base_mapping_policy=node:PE=1 ' + 'mpirun -x OMPI_MCA_btl -x OMPI_MCA_rmaps_base_mapping_policy ' + '-report-bindings -display-map -n 2 -npernode 1 --use-hwthread-cpus ' + '-host 10.0.0.1:slots=2,10.0.0.2:slots=2') + self.assertEqual(expected_mpirun, mpirun) + + def testMpiRunCommandNoEnvVarsIsFormattedCorrectly(self): + vm = MockVm() + total_proc = 2 + ppn = 1 + hosts = ['10.0.0.1', '10.0.0.2'] + environment = [] + + mpirun = imb.MpiRunCommand(vm, hosts, total_proc, ppn, environment, [], + False) + + expected_mpirun = ( + 'mpirun -report-bindings -display-map -n 2 -npernode 1 ' + '--use-hwthread-cpus -host 10.0.0.1:slots=2,10.0.0.2:slots=2') + self.assertEqual(expected_mpirun, mpirun) + + def testMpiRunCommandNoPpnSpecified(self): + vm = MockVm() + total_proc = 8 + ppn = 0 + hosts = ['10.0.0.1', '10.0.0.2', '10.0.0.3', '10.0.0.4'] + environment = [] + + mpirun = imb.MpiRunCommand(vm, hosts, total_proc, ppn, environment, [], + False) + expected_mpirun = ( + 'mpirun -report-bindings -display-map -n 8 -npernode 2 ' + '--use-hwthread-cpus -host ' + '10.0.0.1:slots=8,10.0.0.2:slots=8,10.0.0.3:slots=8,10.0.0.4:slots=8') + self.assertEqual(expected_mpirun, mpirun) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/linux_packages/mpi_test.py b/tests/linux_packages/mpi_test.py new file mode 100644 index 0000000000..87090e693d --- /dev/null +++ b/tests/linux_packages/mpi_test.py @@ -0,0 +1,266 @@ +"""Tests for MPI benchmark.""" + +import json +import os +from typing import Any, Dict, List, Union +import unittest +from unittest import mock +import uuid +from absl import flags +from absl.testing import parameterized +from perfkitbenchmarker import errors +from perfkitbenchmarker.linux_packages import intelmpi +from perfkitbenchmarker.linux_packages import 
mpi +from perfkitbenchmarker.linux_packages import omb +from tests import pkb_common_test_case + +FLAGS = flags.FLAGS + +_TEST_DIR = os.path.join(os.path.dirname(__file__), '..', 'data', 'mpi') +MPI_VARS = '/opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh' +# all "mpirun" commands start with this +RUN_PREFIX = f'. {MPI_VARS};' + + +def FilePath(file_name: str) -> str: + return os.path.join(_TEST_DIR, file_name) + + +def ReadMpiOutput(file_name: str) -> str: + with open(FilePath(file_name)) as reader: + return reader.read() + + +def ReadJson(file_name: str) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + with open(FilePath(file_name)) as reader: + return json.load(reader) + + +def _CreateMpiDataFromDict(data: Dict[str, Any]) -> mpi.MpiData: + if 'error' in data: + return mpi.MpiData(is_error=True, bytes=data['bytes']) + else: + number_bytes = data.pop('bytes', None) + repetitions = data.pop('repetitions', None) + return mpi.MpiData( + bytes=number_bytes, repetitions=repetitions, data=data['data']) + + +def _CreateMpiResultsFromDict(result_json: Dict[str, Any]) -> mpi.MpiResult: + mpi_datas = [ + _CreateMpiDataFromDict(mpi_data) for mpi_data in result_json['data'] + ] + result_json['data'] = mpi_datas + if result_json.get('group_layout'): + # Convert json-serialized group number from string to an int + result_json['group_layout'] = { + int(key): value for key, value in result_json['group_layout'].items() + } + return mpi.MpiResult(**result_json) + + +def _CreateMpiResponseFromDict( + data: List[Dict[str, Any]]) -> List[mpi.MpiResult]: + return [_CreateMpiResultsFromDict(result) for result in data] + + +def ReadParsedOutput(file_name: str) -> List[mpi.MpiResult]: + return _CreateMpiResponseFromDict(ReadJson(file_name)) + + +def _MockVm(ip: str) -> mock.Mock: + vm = mock.Mock(internal_ip=ip) + vm.NumCpusForBenchmark.return_value = 8 + return vm + + +class MpiTestCase(pkb_common_test_case.PkbCommonTestCase): + MPI_VERSION = '2019.2-057' + # Lines from the -dumpfile latency file. Format is (bytes, latency usec). + LATENCY_DATA_FILE = ( + '0 1.123', + '0 2.9999', + '0 42.3', + '1024 2.0', + '1024 3.0', + '1024 3.0', + ) + + # The latency_data_file summarized in a Dict. + LATENCY_DATA: Dict[int, Dict[float, int]] = { + 0: { + 1.12: 1, + 3.0: 1, + 42.0: 1 + }, + 1024: { + 2.0: 1, + 3.0: 2 + } + } + + def setUp(self): + super(MpiTestCase, self).setUp() + FLAGS.intelmpi_version = self.MPI_VERSION + self.enter_context( + mock.patch.object(intelmpi, 'MpiVars', return_value=MPI_VARS)) + + @parameterized.parameters( + # mpirun -n 120 -hosts a,b,c,d -ppn 1 mpi-benchmarks/.... 
+ ('mpi_allgather_output.txt', 'mpi_allgather_parsed.json'), + ('mpi_barrier_output.txt', 'mpi_barrier_parsed.json'), + ('mpi_pingpong_output.txt', 'mpi_pingpong_parsed.json'), + ('mpi_reduce_output.txt', 'mpi_reduce_parsed.json'), + ('mpi_latencies_output.txt', 'mpi_latencies_parsed.json'), + ('mpi_one_put_all_output.txt', 'mpi_one_put_all_parsed.json'), + ) + def testParseMpiOutput(self, mpi_output_file: str, + mpi_parsed_file: str) -> None: + found = list( + mpi.MpiResultParser(ReadMpiOutput(mpi_output_file).splitlines())) + expected = ReadParsedOutput(mpi_parsed_file) + self.assertEqual(expected, found) + + def testVerifyInstall(self) -> None: + vms = [_MockVm(ip) for ip in ('a', 'b')] + vms[0].RobustRemoteCommand.return_value = '', '' + mpi.VerifyInstall(vms) + mpirun_cmd = ('mpirun -n 8 -hosts a,b -ppn 8 mpi-benchmarks/IMB-MPI1 ' + '-msglog 10:11 -multi 0 -time 20 -off_cache -1 -iter 100 ' + '-iter_policy off -zero_size off -show_tail yes PingPong') + vms[0].RobustRemoteCommand.assert_called_with(RUN_PREFIX + ' ' + mpirun_cmd) + + def _CreateMpiRequest(self, + record_latencies: bool, + iterations: int = 100000) -> mpi.MpiRequest: + return mpi.MpiRequest( + vms=[_MockVm('a'), _MockVm('b')], + total_processes=10, + ppn=0, + suite='IMB-MPI1', + tests=['PingPong'], + msglog_min=10, + msglog_max=11, + timeout=20, + off_cache_size=-1, + off_cache_line_size=None, + iterations=iterations, + include_zero_byte=False, + compile_from_source=True, + record_latencies=record_latencies, + multi=True) + + def testRunMpiStats(self) -> None: + vm = _MockVm('a') + vm.RobustRemoteCommand.return_value = ReadMpiOutput( + 'mpi_pingpong_output.txt'), '' + request = self._CreateMpiRequest(False) + response = mpi.RunMpiStats(vm, request) + self.assertEqual(RUN_PREFIX + ' mpirun -n 10 -hosts a,b', response.mpi_run) + self.assertEqual('intel', response.vendor) + self.assertEqual('2019.2-057', response.version) + # fully tested in testParseFiles + self.assertLen(response.results, 1) + expected_args = ('mpi-benchmarks/IMB-MPI1 -msglog 10:11 -multi 0 -time 20 ' + '-off_cache -1 -iter 100000 -iter_policy off ' + '-zero_size off -show_tail yes -map 5x2 PingPong') + self.assertEqual(expected_args, response.args) + + @mock.patch.object(mpi, '_GroupLatencyLines') + @mock.patch.object(uuid, 'uuid4', side_effect=[mock.PropertyMock(hex='abc')]) + def testRunMpiStatsLatencyFile(self, mock_uuid: mock.Mock, + mock_create_histo: mock.Mock) -> None: + mock_create_histo.return_value = [[ + '1024 10.0', '1024 11.0', '2048 11.10', '2048 11.11' + ]] + vm = _MockVm('a') + vm.RobustRemoteCommand.return_value = ( + ReadMpiOutput('mpi_barrier_output.txt'), '') + request = self._CreateMpiRequest(True, 2) + response = mpi.RunMpiStats(vm, request) + # has the -show_tail and -dumpfile flags set + expected_args_re = (r'.*-zero_size off -show_tail yes ' + r'-dumpfile /tmp/latency\S+ -map 5x2 PingPong$') + self.assertRegex(response.args, expected_args_re) + mock_create_histo.assert_called_with(vm, '/tmp/latency-PingPong-abc.txt', 2) + + @mock.patch('builtins.open', + mock.mock_open(read_data='\n'.join(LATENCY_DATA_FILE))) + def testGroupLatencyLines(self): + vm = mock.Mock() + vm.TryRemoteCommand.return_value = True + expected_group1 = ['0 1.123', '0 2.9999', '0 42.3'] + expected_group2 = ['1024 2.0', '1024 3.0', '1024 3.0'] + lines = mpi._GroupLatencyLines(vm, '/tmp/remote.txt', 3) + self.assertEqual([expected_group1, expected_group2], lines) + vm.TryRemoteCommand.assert_called_with('test -f /tmp/remote.txt') + + def 
testGroupLatencyLinesMissingFile(self): + # method returns an empty list if check for remote latency file fails + vm = mock.Mock() + vm.TryRemoteCommand.return_value = False + lines = mpi._GroupLatencyLines(vm, '/tmp/remote.txt', 3) + self.assertEmpty(lines) + + def testCreateMpiDataForHistogram(self) -> None: + FLAGS.run_uri = '12345678' + grouped_lines = [['1024 10.0', '1024 11.0', '2048 11.10', '2048 11.11']] + mpi_data1 = mpi.MpiData( + bytes=1024, repetitions=2, data={'p50': 10.5}, is_error=False) + mpi_data2 = mpi.MpiData( + bytes=2048, repetitions=2, data={'p50': 11.0}, is_error=False) + parsed_results = [ + mpi.MpiResult(benchmark='PingPong', data=[mpi_data1, mpi_data2]) + ] + self.assertIsNone(parsed_results[0].data[0].histogram) + self.assertIsNone(parsed_results[0].data[1].histogram) + mpi._CreateMpiDataForHistogram(grouped_lines, parsed_results) + # number of results did not change -- added "histogram=" entry to it + self.assertLen(parsed_results, 1) + self.assertEqual({10.0: 1, 11.0: 1}, parsed_results[0].data[0].histogram) + self.assertEqual({11.1: 2}, parsed_results[0].data[1].histogram) + + def testCreateMpiDataForHistogramNoParsedResults(self) -> None: + # No parsed results -> no histograms are parsed + FLAGS.run_uri = '12345678' + grouped_lines = [['1024 10.0', '1024 11.0', '2048 11.10', '2048 11.11']] + parsed_results = [] + self.assertLen(parsed_results, 0) + mpi._CreateMpiDataForHistogram(grouped_lines, parsed_results) + self.assertLen(parsed_results, 0) + + def testRunMpiStatsWithException(self) -> None: + request = self._CreateMpiRequest(False) + vm = request.vms[0] + vm.RobustRemoteCommand.side_effect = [ + errors.VirtualMachine.RemoteCommandError + ] + with self.assertRaises(errors.VirtualMachine.RemoteCommandError): + mpi.RunMpiStats(vm, request) + # pytyping thinks that vm.RemoteCommand is a Callable but it is a Mock + last_command = vm.RemoteCommand.call_args[0][0] # pytype: disable=attribute-error + self.assertRegex(last_command, 'tail.*/var/log/') + vm.RemoteCommand.assert_called_once() # pytype: disable=attribute-error + + def testParseMpiPinning(self): + lines = ReadMpiOutput('mpi_debug_output.txt').splitlines() + # nodes 0 and 1 had the same MPI pinning groups of 0,1,2,3 CPUids + expected_pinning = ['0:0:0,1,2,3', '1:1:0,1,2,3'] + + self.assertEqual(expected_pinning, omb.ParseMpiPinning(lines)) + + def testParseMpiEnv(self): + lines = ReadMpiOutput('mpi_debug_output.txt').splitlines() + expected_mpi_env = { + 'I_MPI_DEBUG': '5', + 'I_MPI_HYDRA_TOPOLIB': 'hwloc', + 'I_MPI_INTERNAL_MEM_POLICY': 'default', + 'I_MPI_MPIRUN': 'mpirun', + 'I_MPI_ROOT': '/opt/intel/compilers_and_libraries_2020.0.166/linux/mpi' + } + + self.assertEqual(expected_mpi_env, mpi.ParseMpiEnv(lines)) + + +if __name__ == '__main__': + unittest.main()
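
A minimal usage sketch, not part of the diff above and purely illustrative: it assumes the tests/data/mpi fixtures and the perfkitbenchmarker.linux_packages.mpi parser that the tests exercise, and shows how an IMB output file like mpi_pingpong_output.txt is turned into the per-benchmark time_avg values that the benchmark reports. The fixture path and the printed fields are assumptions taken from the fixtures in this change; run from a checkout where the package is importable.

# Illustrative sketch only; field names (benchmark, bytes, repetitions,
# time_avg, is_error) mirror the parsed-JSON fixtures added in this change.
import os

from perfkitbenchmarker.linux_packages import mpi

# Hypothetical path to one of the fixtures added above.
FIXTURE = os.path.join('tests', 'data', 'mpi', 'mpi_pingpong_output.txt')


def main() -> None:
  with open(FIXTURE) as reader:
    lines = reader.read().splitlines()
  # MpiResultParser is iterated in testParseMpiOutput above; it yields one
  # MpiResult per "Benchmarking <name>" block in the IMB output.
  for result in mpi.MpiResultParser(lines):
    for mpi_data in result.data:
      if mpi_data.is_error:
        continue  # error rows carry no timing data
      print(f'{result.benchmark} bytes={mpi_data.bytes} '
            f'reps={mpi_data.repetitions} '
            f"t_avg={mpi_data.data.get('time_avg')} usec")


if __name__ == '__main__':
  main()

For the pingpong fixture this would print one line per message size (1024 and 2048 bytes), matching the time_avg values in mpi_pingpong_parsed.json.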