Skip to content

Commit

Permalink
Add a Python parser for diag.out files and fix header formatting (#2666)
Browse files Browse the repository at this point in the history
This PR adds some Python code that can parse the diag.out files and remove any
duplicate entries (which often occur when a run is restarted from an earlier
checkpoint). It also fixes some mistakes in the gravity_diag.out header
formatting, and cleans up some of the other header-writing code.

N.B.: Standard streams in C++ are right-justified by default (compared to
Python, where numbers are right-justified and strings are left-justified).

* update the header printing code for gravity_diag.out to match the others
   (the column numbers were 26 characters long instead of 25, and number 8
   was missing)

* simplify species formatting for species_diag.out (the previous code was just
   adding leading spaces)

* update the header field widths for amr_diag.out to match what's used for
   the data
  • Loading branch information
yut23 authored Dec 3, 2023
1 parent 02a2d50 commit 4f728c4
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 35 deletions.
61 changes: 26 additions & 35 deletions Source/driver/sum_integrated_quantities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ Castro::sum_integrated_quantities ()
header << std::endl;

data_log1 << std::setw(intwidth) << "# COLUMN 1";
data_log1 << std::setw(fixwidth) << " 2";
data_log1 << std::setw(fixwidth) << 2;

for (int icol = 3; icol <= n; ++icol) {
data_log1 << std::setw(datwidth) << icol;
Expand Down Expand Up @@ -482,28 +482,29 @@ Castro::sum_integrated_quantities ()

if (time == 0.0) {

log << std::setw(intwidth) << "# COLUMN 1";
log << std::setw(fixwidth) << " 2";
log << std::setw(fixwidth) << " 3";
log << std::setw(fixwidth) << " 4";
log << std::setw(fixwidth) << " 5";
log << std::setw(fixwidth) << " 6";
log << std::setw(fixwidth) << " 7";
int n = 0;

std::ostringstream header;

header << std::setw(intwidth) << "# TIMESTEP";
header << std::setw(fixwidth) << " TIME";
header << std::setw(intwidth) << "# TIMESTEP"; ++n;
header << std::setw(fixwidth) << " TIME"; ++n;

header << std::setw(datwidth) << " h_+ (x)";
header << std::setw(datwidth) << " h_x (x)";
header << std::setw(datwidth) << " h_+ (y)";
header << std::setw(datwidth) << " h_x (y)";
header << std::setw(datwidth) << " h_+ (z)";
header << std::setw(datwidth) << " h_x (z)";
header << std::setw(datwidth) << " h_+ (x)"; ++n;
header << std::setw(datwidth) << " h_x (x)"; ++n;
header << std::setw(datwidth) << " h_+ (y)"; ++n;
header << std::setw(datwidth) << " h_x (y)"; ++n;
header << std::setw(datwidth) << " h_+ (z)"; ++n;
header << std::setw(datwidth) << " h_x (z)"; ++n;

header << std::endl;

log << std::setw(intwidth) << "# COLUMN 1";
log << std::setw(fixwidth) << 2;

for (int i = 3; i <= n; ++i) {
log << std::setw(datwidth) << i;
}

log << std::endl;

log << header.str();
Expand Down Expand Up @@ -589,27 +590,17 @@ Castro::sum_integrated_quantities ()

std::ostringstream header;

header << std::setw(intwidth) << "# TIMESTEP"; ++n;
header << std::setw(fixwidth) << " TIME"; ++n;

// We need to be careful here since the species names have differing numbers of characters
header << std::setw(intwidth) << "# TIMESTEP"; ++n;
header << std::setw(fixwidth) << " TIME"; ++n;

for (int i = 0; i < NumSpec; i++) {
std::string outString{};
std::string massString{"Mass "};
std::string specString{species_names[i]};
while (static_cast<int>(outString.length() + specString.length() + massString.length()) < datwidth) {
outString += " ";
}
outString += massString;
outString += specString;
header << std::setw(datwidth) << outString; ++n;
header << std::setw(datwidth) << ("Mass " + species_names[i]); ++n;
}

header << std::endl;

log << std::setw(intwidth) << "# COLUMN 1";
log << std::setw(fixwidth) << " 2";
log << std::setw(fixwidth) << 2;

for (int i = 3; i <= n; ++i) {
log << std::setw(datwidth) << i;
Expand Down Expand Up @@ -694,19 +685,19 @@ Castro::sum_integrated_quantities ()
header << std::setw(fixwidth) << " DT"; ++n;
header << std::setw(intwidth) << " FINEST LEV"; ++n;
header << std::setw(fixwidth) << " MAX NUMBER OF SUBCYCLES"; ++n;
header << std::setw(fixwidth) << " COARSE TIMESTEP WALLTIME"; ++n;
header << std::setw(datwidth) << " COARSE TIMESTEP WALLTIME"; ++n;
#ifdef AMREX_USE_GPU
header << std::setw(fixwidth) << " MAXIMUM GPU MEMORY USED"; ++n;
header << std::setw(fixwidth) << " MINIMUM GPU MEMORY FREE"; ++n;
header << std::setw(datwidth) << " MAXIMUM GPU MEMORY USED"; ++n;
header << std::setw(datwidth) << " MINIMUM GPU MEMORY FREE"; ++n;
#endif

header << std::endl;

log << std::setw(intwidth) << "# COLUMN 1";
log << std::setw(fixwidth) << " 2";
log << std::setw(fixwidth) << 2;

for (int i = 3; i < 4; ++i) {
log << std::setw(datwidth) << i;
log << std::setw(fixwidth) << i;
}

log << std::setw(intwidth) << 4; // Handle the finest lev column
Expand Down
99 changes: 99 additions & 0 deletions Util/scripts/diag_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Helper functions for working with Castro diagnostic files (*_diag.out)
To use these in a standalone script, you can do one of the following:
* append $CASTRO_HOME/Util/scripts to sys.path at the top of your script:
sys.path.append("<path to Castro>/Util/scripts")
* add a symlink to this file in the same directory as your script:
$ ln -s "$CASTRO_HOME/Util/scripts/diag_parser.py" .
* copy this file into the same directory as your script
Then you can do `from diag_parser import deduplicate, read_diag_file`.
"""

from pathlib import Path

import numpy as np

""" Format notes
files are opened in Castro.cpp, data is written in sum_integrated_quantities.cpp
data_logs[0]: grid_diag.out
intwidth, fixwidth, datwidth*
data_logs[1]: gravity_diag.out
- this was previously missing the last column number (8), which we handle for
backwards compatibility
intwidth, fixwidth, datwidth, datwidth, datwidth, datwidth, datwidth, datwidth
data_logs[2]: species_diag.out
intwidth, fixwidth, datwidth*
data_logs[3]: amr_diag.out
- if compiled with GPU support, this will have two additional integer fields at
the end with size `datwidth` for the GPU memory usage
- column 5 (max number of subcycles) is an integer
intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth
"""

datwidth = 25  # Floating point data in scientific notation
fixwidth = 25  # Floating point data not in scientific notation
intwidth = 12  # Integer data

# Widths of the leading columns of each supported diagnostic file.  Any
# additional columns after these are assumed to be floating point values in
# scientific notation (amr_diag.out gets special handling).  These lists are
# shared templates: read_diag_file() copies them before padding, so they keep
# the same contents no matter how many files have been read.
FIELD_WIDTHS = {
    "grid_diag.out": [intwidth, fixwidth],
    "gravity_diag.out": [intwidth, fixwidth] + [datwidth] * 6,
    "species_diag.out": [intwidth, fixwidth],
    "amr_diag.out": [intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth],
}


def read_diag_file(file_path):
    """Reads a Castro diagnostic file into a numpy structured array.

    Currently only supports the default files that Castro generates.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Path to the diagnostic file.  The file name (last path component)
        selects the column layout and must be a key of FIELD_WIDTHS.

    Returns
    -------
    numpy.ndarray
        The result of np.genfromtxt() on the fixed-width columns, with field
        names taken from the header line that follows the column-number line.

    Raises
    ------
    ValueError
        If the file name is not a key of FIELD_WIDTHS.
    """
    if not isinstance(file_path, Path):
        file_path = Path(file_path)
    filetype = file_path.name
    if filetype not in FIELD_WIDTHS:
        raise ValueError("Unsupported file name")
    # Copy the template: the list is padded below, and extending the shared
    # FIELD_WIDTHS entry in place would leak this file's column count into
    # every later call for the same file type.
    widths = list(FIELD_WIDTHS[filetype])
    with open(file_path, "r") as f:
        # try getting the number of columns from the first line
        first_line = f.readline().rstrip("\n")
        if filetype == "gravity_diag.out":
            # older gravity_diag.out files are missing the last column number
            # (8), but the file fortunately has a fixed number of columns
            num_columns = 8
        else:
            num_columns = int(first_line.split()[-1])
        # pad out the widths list on the right if necessary (a non-positive
        # difference adds nothing)
        widths.extend([datwidth] * (num_columns - len(widths)))
        # infer datatypes from the widths
        dtypes = [int if w == intwidth else float for w in widths]
        # amr_diag.out has several integer columns with long names
        if filetype == "amr_diag.out":
            dtypes[4] = int  # max number of subcycles
            if num_columns >= 8:
                dtypes[6] = int  # maximum gpu memory used
                dtypes[7] = int  # minimum gpu memory free
        # already read the first header line, so we don't need to skip any rows
        data = np.genfromtxt(
            f, delimiter=widths, comments="#", dtype=dtypes, names=True
        )
    return data


def deduplicate(data):
    """Deduplicate based on the timestep, keeping only the last occurrence.

    Parameters
    ----------
    data : numpy.ndarray
        Structured array with a "TIMESTEP" field.  A single record passed as
        a 0-d array is accepted and treated as a one-element array.

    Returns
    -------
    numpy.ndarray
        1-d structured array with one record per distinct timestep, ordered
        by increasing timestep.  Where a timestep appears more than once, the
        record that comes last in `data` is the one kept.
    """
    # A single-record input may arrive as a 0-d structured array (e.g. from
    # np.genfromtxt() on a file with one data row); the slicing and shape[0]
    # below need at least one dimension.
    data = np.atleast_1d(data)
    # get the unique indices into the reversed timestep array, so we find the
    # final occurrence of each timestep
    _, rev_indices = np.unique(data["TIMESTEP"][::-1], return_index=True)
    # np.unique() sorts by value, so we don't need to un-reverse rev_indices
    unique_indices = data.shape[0] - rev_indices - 1
    return data[unique_indices]

0 comments on commit 4f728c4

Please sign in to comment.