Skip to content

Commit

Permalink
Adding utilities to parse i-PI output files into more standard python…
Browse files Browse the repository at this point in the history
… / ase objects (i-pi#313)

* Added utilities to parse ipi output files, and also mention this in the docs
* Disable installation of tests, as it's broken!
  • Loading branch information
ceriottm authored Mar 8, 2024
1 parent c24bca1 commit e8601c4
Show file tree
Hide file tree
Showing 10 changed files with 277 additions and 14 deletions.
3 changes: 1 addition & 2 deletions docs/src/getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@ as the system has installed:

- Python version 3.6 or greater (starting from version 2.0, i-PI is not Python2
compatible)

- The Python numerical library NumPy
src/getting-started.rst-

See :ref:`runningsimulations` for more details on how to launch i-PI.

Additionally, most client codes will have their own requirements. Many
Expand Down
33 changes: 33 additions & 0 deletions docs/src/output-files.rst
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,36 @@ run
> python i-pi RESTART
Reading output files
--------------------

It can be useful to parse the output files of i-PI into a format that can
be readily manipulated in a custom Python script. To this end, i-PI provides
a few utilities. `ipi.read_output` that can be used to parse a property output
file, that returns data blocks as a dictionary of numpy array, and additional
information from the header (such as units, and the description of each
property) as a separate dictionary

.. code-block::
from ipi import read_output
data, info = read_output("simulation.out")
Trajectory files can be read with `ipi.read_trajectory`. This reads the
trajectory output into a list of `ase.Atoms` objects (hence this functionality
has a dependency on `ase`), converting positions and cell to angstrom, and
moving other properties to arrays and converting
them to ASE units (e.g. forces are in eV/Å). `extras` output files (that
contain either numerical data, or raw strings, returned by the driver code
in addition to energy and forces) can be processed by using `format='extras'`
and an option: the call will then return a dictionary with an entry having the
name of the type of extra (if present) and either a list of the raw strings,
or a numpy array with the data. A second dictionary entry contains the list
of step numbers.

.. code-block::
from ipi import read_trajectory
data = read_trajectory("simulation.dipoles", format="extras")
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
<output prefix='simulation'>
<properties stride='1' filename='out'> [ step, time{picosecond}, conserved, temperature{kelvin}, kinetic_cv, potential, pressure_cv{megapascal}] </properties>
<trajectory filename='pos' stride='20'> positions{angstrom} </trajectory>
<trajectory filename='for' stride='20'> forces </trajectory>
</output>
<total_steps> 100 </total_steps>
<prng>
Expand Down
14 changes: 13 additions & 1 deletion ipi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@
The i-PI package.
"""

__all__ = ["clients", "engine", "inputs", "interfaces", "utils", "ipi_global_settings"]
# expose some utility functions in a more direct way
from ipi.utils.parsing import read_output, read_trajectory

__all__ = [
"clients",
"engine",
"inputs",
"interfaces",
"utils",
"ipi_global_settings",
"read_output",
"read_trajectory",
]

ipi_global_settings = {"floatformat": "%16.8e"}
1 change: 1 addition & 0 deletions ipi/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"mathtools",
"prng",
"inputvalue",
"parsing",
"nmtransform",
"messages",
"softexit",
Expand Down
5 changes: 1 addition & 4 deletions ipi/utils/io/io_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,4 @@ def process_units(
atoms.names[:] = names
atoms.m[:] = masses

return {
"atoms": atoms,
"cell": cell,
}
return {"atoms": atoms, "cell": cell, "comment": comment}
2 changes: 1 addition & 1 deletion ipi/utils/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def banner():
| ################################# |
\__#_/ \____/ \____/ \_#__/
# _ _______ _____ #
# (_) |_ __ \|_ _| # -*- v 2.6.0 -*-
# (_) |_ __ \|_ _| # -*- v 2.6.3 -*-
# __ ______ | |__) | | | #
Y [ ||______|| ___/ | | # A Universal Force Engine
0 0 | | _| |_ _| |_ #
Expand Down
222 changes: 222 additions & 0 deletions ipi/utils/parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
""" Utility functions to parse i-PI output files.
These are meant to be used in Python post-processing pipelines, so
trajectory files are read as ASE objects (assuming units to be
Angstrom and eV), and output files are read in a dictionary of
numpy arrays.
"""

import re
import numpy as np
from ipi.utils.units import unit_to_user

try:
import ase
except ImportError:
ase = None

__all__ = ["read_output", "read_trajectory"]


def read_output(filename):
"""Reads an i-PI output file and returns a dictionary with the properties in a tidy order,
and information on units and descriptions of the content.
Usage:
read_output("filename")
Returns:
values, info
values: a dictionary with the property names as keys, and the values as numpy arrays
info: a dictionary with the property names as keys and as values tuples of (units, description)
"""

# Regex pattern to match header lines and capture relevant parts
header_pattern = re.compile(
r"#\s*(column|cols\.)\s+(\d+)(?:-(\d+))?\s*-->\s*([^\s\{]+)(?:\{([^\}]+)\})?\s*:\s*(.*)"
)

# Reading the file
with open(filename, "r") as file:
lines = file.readlines()

header_lines = [line for line in lines if line.startswith("#")]
data_lines = [line for line in lines if not line.startswith("#") and line.strip()]

# Interprets properties
properties = {}
for line in header_lines:
match = header_pattern.match(line)
if match:
# Extracting matched groups
col_type, start_col, end_col, property_name, units, description = (
match.groups()
)
col_info = f"{start_col}-{end_col}" if end_col else start_col
properties[col_info] = {
"name": property_name,
"units": units,
"description": description,
}

# Parse data
values_dict = {}
info_dict = {}
for col_info, prop_info in properties.items():
# Initialize list to hold values for each property
values_dict[prop_info["name"]] = []
# Save units and description
info_dict[prop_info["name"]] = (prop_info["units"], prop_info["description"])

for line in data_lines:
values = line.split()
for column_info, prop_info in properties.items():
if "-" in column_info: # Multi-column property
start_col, end_col = map(
int, column_info.split("-")
) # 1-based indexing
prop_values = values[
start_col - 1 : end_col
] # Adjust to 0-based indexing
else: # Single column property
col_index = int(column_info) - 1 # Adjust to 0-based indexing
prop_values = [values[col_index]]

values_dict[prop_info["name"]].append([float(val) for val in prop_values])

for prop_name, prop_values in values_dict.items():
values_dict[prop_name] = np.array(
prop_values
).squeeze() # make 1-col into a flat array

return values_dict, info_dict


def read_trajectory(
filename,
format=None,
dimension="automatic",
units="automatic",
cell_units="automatic",
):
"""Reads a file in i-PI format and returns it in ASE format.
`format` can be `xyz` (i-PI formatted), `pdb`, `binary`, `json`, `ase`, and if not specified it'll
be inferred from the filename extension. `extras` will read a trajectory containing the additional
data returned from a calculator; will try to read as a float array, and if it fails it'll resort
to returning a list of raw strings.
units can be inferred from the file content, or specified with `dimension`, `units` and `cell_units`
"""

if ase is None:
raise ImportError(
"read_trajectory requires the `ase` package to return the structure in ASE format"
)

from ipi.utils.io import read_file

if format is None:
# tries to infer the file format
format = filename[filename.rfind(".") + 1 :]
if format not in ["xyz", "pdb", "binary", "json", "ase"]:
raise ValueError(f"Unrecognized file format: {format}")

file_handle = open(filename, "r")
bohr2angstrom = unit_to_user("length", "angstrom", 1.0)
comment_regex = re.compile(r"(\w+)\{([^}]+)\}")
step_regex = re.compile(r"Step:\s+(\d+)")

frames = []
while True:
try:
if format == "extras":
file = open(filename, "r")
step_regex = re.compile(r"#EXTRAS\((\w+)\)# *Step:\s+(\d+)")
step_list = []
data_list = []
data_frame = []
is_array = True
property_name = ""
for line in file:
matches = step_regex.findall(line)
if len(matches) > 0:
if len(data_frame) > 0:
try:
data_processed = np.loadtxt(data_frame)
except:
is_array = False
data_processed = "\n".join(data_frame)
data_list.append(data_processed)
data_frame = []

# Found a new step, update current_step and initialize the list for data
step_list.append(int(matches[0][1]))
if property_name == "":
property_name = matches[0][0]
elif property_name != matches[0][0]:
raise ValueError(
f"Inconsistent property {matches[0][0]} found in extras containing {property_name}"
)
else:
data_frame.append(line)

if len(data_frame) > 0:
try:
data_processed = np.loadtxt(data_frame)
except:
is_array = False
data_processed = "\n".join(data_frame)
data_list.append(data_processed)
if is_array:
data_list = np.array(data_list).squeeze()
return {
"step": np.asarray(step_list, dtype=int),
property_name: data_list,
}
else:
ret = read_file(
format,
file_handle,
dimension=dimension,
units=units,
cell_units=cell_units,
)

frame = ase.Atoms(
ret["atoms"].names,
positions=ret["atoms"].q.reshape((-1, 3)) * bohr2angstrom,
cell=ret["cell"].h.T * bohr2angstrom,
pbc=True,
)

# parse comment to get the property
matches = comment_regex.findall(ret["comment"])

# get what we have found
if len(matches) >= 2:
what = matches[-2][0]
else: # defaults to reading positions
what = "positions"

# ... and the step
matches = step_regex.findall(ret["comment"])
if len(matches) >= 1:
frame.info["step"] = int(matches[-1][0])

# if we have forces, set positions to zero (that data is missing!) and set forces instead
if what == "forces":
# set forces and convert to eV/angstrom
frame.positions *= 0
frame.arrays["forces"] = ret["atoms"].q.reshape(
(-1, 3)
) * unit_to_user("force", "ev/ang", 1.0)

frames.append(frame)

except EOFError:
break
except:
raise

return frames
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = ipi
version = 2.6.2
version = 2.6.3
description = A Python interface for ab initio path integral molecular dynamics simulations
long_description = file: README.md
long_description_content_type = text/x-rst
Expand Down
8 changes: 3 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,17 @@

from setuptools import setup, find_packages

# exclude i-pi-driver that is a compiled fortran file when installing from a local dir
scripts = [str(p) for p in Path("bin").iterdir() if p.name != "i-pi-driver"]

setup(
packages=[
*find_packages(exclude=["ipi_tests*"]),
"ipi._driver",
"ipi._driver.pes",
"ipi.tests",
"ipi.tests.regression_tests",
],
package_dir={
"ipi._driver": "drivers/py",
"ipi.tests": "ipi_tests",
},
package_data={"ipi.tests.regression_tests": ["tests/NVE/NVE_1/harmonic_python/*"]},
scripts=[str(p) for p in Path("bin").iterdir()],
scripts=scripts,
)

0 comments on commit e8601c4

Please sign in to comment.