Skip to content

Commit

Permalink
extend capabilities of read_raw_data (#84)
Browse files Browse the repository at this point in the history
* extend capabilities of read_raw_data

* possibility to read part of the file, with offset and partial_read
* choice of row/column major order

* testing + better error handling

* function will warn user if trying to pass inconsistent args
* function checks byte offset < file size

* Fixing style errors.

* get it shorter

* completing tests for codecov

* Fixing style errors.

* fix line length

* fix line length

* try to improve coverage

* finalize request

* remove blank lines
* iteration on dtype cleaner

* replace assert with raise error

* try to fool codecov

* fix indentation

* codecov didn't bite the bait

* fix error in testing
  • Loading branch information
raphaeldussin authored and rabernat committed Jul 16, 2018
1 parent 14c7338 commit 6b24a3c
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 33 deletions.
82 changes: 65 additions & 17 deletions xmitgcm/test/test_mds_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,33 +198,81 @@ def test_parse_meta(tmpdir):
for k, v in expected.items():
assert result[k] == v


@pytest.mark.parametrize("dtype",
                         [np.dtype('f8'), np.dtype('f4'), np.dtype('i4')])
def test_read_raw_data(tmpdir, dtype):
    """Check our utility for reading raw data.

    Covers full-file reads (plain and memmap), partial reads with a byte
    offset, and every error path of ``read_raw_data``.
    """
    from xmitgcm.utils import read_raw_data

    # --- full-file read ---
    shape = (2, 4)
    # create some test data and write it to a file
    testdata = np.zeros(shape, dtype)
    datafile = tmpdir.join("tmp.data")
    datafile.write_binary(testdata.tobytes())
    fname = str(datafile)
    # now test the function
    data = read_raw_data(fname, dtype, shape)
    np.testing.assert_allclose(data, testdata)
    # interestingly, memmaps are also ndarrays, but not vice versa
    assert isinstance(data, np.ndarray) and not isinstance(data, np.memmap)
    # check memmap
    mdata = read_raw_data(fname, dtype, shape, use_mmap=True)
    assert isinstance(mdata, np.memmap)

    # make sure errors are correct
    wrongshape = (2, 5)
    with pytest.raises(IOError):
        _ = read_raw_data(fname, dtype, wrongshape)

    # --- partial reads with offset ---
    shape = (5, 15, 10)
    shape_subset = (15, 10)
    # each level k of the test data is filled with the constant k
    testdata = np.zeros(shape, dtype)
    x = np.arange(shape[0], dtype=dtype)
    for k in np.arange(shape[0]):
        testdata[k, :, :] = x[k]
    # write to a file
    datafile = tmpdir.join("tmp.data")
    datafile.write_binary(testdata.tobytes())
    fname = str(datafile)
    # read each level back via its byte offset
    for k in np.arange(shape[0]):
        offset = (k * shape[1] * shape[2] * dtype.itemsize)
        data = read_raw_data(fname, dtype, shape_subset,
                             offset=offset, partial_read=True)
        np.testing.assert_allclose(data, testdata[k, :, :])
        assert isinstance(data, np.ndarray) and not isinstance(
            data, np.memmap)
        # check memmap
        mdata = read_raw_data(fname, dtype, shape_subset,
                              offset=offset, partial_read=True,
                              use_mmap=True)
        assert isinstance(mdata, np.memmap)

    # test it breaks when it should
    with pytest.raises(IOError):
        # read with wrong shape
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=0, partial_read=False)
    with pytest.raises(IOError):
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=0, partial_read=False, use_mmap=True)
    with pytest.raises(ValueError):
        # use offset when trying to read global file
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=4, partial_read=False)
    with pytest.raises(ValueError):
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=4, partial_read=False, use_mmap=True)
    # offset is too big
    with pytest.raises(ValueError):
        _ = read_raw_data(fname, dtype, shape, offset=(
            shape[0] * shape[1] * shape[2] * dtype.itemsize),
            partial_read=True)
    with pytest.raises(ValueError):
        _ = read_raw_data(fname, dtype, shape, offset=(
            shape[0] * shape[1] * shape[2] * dtype.itemsize),
            partial_read=True, use_mmap=True)

# a meta test
def test_file_hiding(all_mds_datadirs):
Expand Down
54 changes: 38 additions & 16 deletions xmitgcm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,8 @@ def read_mds(fname, iternum=None, use_mmap=True, force_dict=True, endian='>',
return out


def read_raw_data(datafile, dtype, shape, use_mmap=False, offset=0,
                  order='C', partial_read=False):
    """Read a raw binary file and shape it.

    PARAMETERS
    ----------
    datafile : str
        Path of the binary file to read
    dtype : numpy.dtype
        Data type of the values in the file
    shape : tuple
        Shape of the data to return
    use_mmap : bool, optional
        Whether to read the data using a numpy.memmap
    offset : int, optional
        Offset (in bytes) to apply on read; only valid together with
        ``partial_read=True``
    order : str, optional
        Row/Column Major = 'C' or 'F'
    partial_read : bool, optional
        If reading part of the file

    RETURNS
    -------
    data : numpy.ndarray
        The data (or a memmap to it)

    RAISES
    ------
    ValueError
        If ``offset`` is non-zero while ``partial_read`` is False, or if
        ``offset`` lies at or past the end of the file.
    IOError
        If a full read is requested and the file size does not match
        ``shape`` and ``dtype``.
    """
    number_of_values = reduce(lambda x, y: x * y, shape)
    expected_number_of_bytes = number_of_values * dtype.itemsize
    actual_number_of_bytes = os.path.getsize(datafile)

    if not partial_read:
        # offset only makes sense when reading part of the file; reject
        # inconsistent arguments rather than silently ignoring the offset
        if offset != 0:
            raise ValueError(
                'When partial_read==False, offset will not be read')
        # a full read must consume exactly the whole file
        if expected_number_of_bytes != actual_number_of_bytes:
            raise IOError('File `%s` does not have the correct size '
                          '(expected %g, found %g)' %
                          (datafile,
                           expected_number_of_bytes,
                           actual_number_of_bytes))

    # the byte offset must fall inside the file
    if offset >= actual_number_of_bytes:
        raise ValueError('bytes offset %g is greater than file size %g' %
                         (offset, actual_number_of_bytes))

    with open(datafile, 'rb') as f:
        if use_mmap:
            data = np.memmap(f, dtype=dtype, mode='r', offset=offset,
                             shape=tuple(shape), order=order)
        else:
            f.seek(offset)
            data = np.fromfile(f, dtype=dtype, count=number_of_values)
            data = data.reshape(shape, order=order)
    data.shape = shape
    return data


def parse_available_diagnostics(fname, layers={}):
Expand Down

0 comments on commit 6b24a3c

Please sign in to comment.