Skip to content

Commit

Permalink
extend capabilities of read_raw_data (#84)
Browse files Browse the repository at this point in the history
* extend capabilities of read_raw_data

* possibility to read part of the file, with offset and partial_read
* choice of row/column major order

* testing + better error handling

* function will warn user if trying to pass inconsistent args
* function checks byte offset < file size

* Fixing style errors.

* get it shorter

* completing tests for codecov

* Fixing style errors.

* fix line length

* fix line length

* try to improve coverage

* finalize request

* remove blank lines
* iteration on dtype cleaner

* replace assert with raise error

* try to fool codecov

* fix indentation

* codecov didn't bite the bait

* fix error in testing
  • Loading branch information
raphaeldussin authored and rabernat committed Jul 16, 2018
1 parent 14c7338 commit 6b24a3c
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 33 deletions.
82 changes: 65 additions & 17 deletions xmitgcm/test/test_mds_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,33 +198,81 @@ def test_parse_meta(tmpdir):
for k, v in expected.items():
assert result[k] == v


@pytest.mark.parametrize("dtype",
                         [np.dtype('f8'), np.dtype('f4'), np.dtype('i4')])
def test_read_raw_data(tmpdir, dtype):
    """Check our utility for reading raw data.

    Covers full-file reads (plain and memmap), partial reads with a byte
    offset, and every error path of ``read_raw_data``.
    """
    from xmitgcm.utils import read_raw_data

    # --- full-file read ---
    shape = (2, 4)
    # create some test data and write it to a file
    testdata = np.zeros(shape, dtype)
    datafile = tmpdir.join("tmp.data")
    datafile.write_binary(testdata.tobytes())
    fname = str(datafile)
    # now test the function
    data = read_raw_data(fname, dtype, shape)
    np.testing.assert_allclose(data, testdata)
    # interestingly, memmaps are also ndarrays, but not vice versa
    assert isinstance(data, np.ndarray) and not isinstance(data, np.memmap)
    # check memmap
    mdata = read_raw_data(fname, dtype, shape, use_mmap=True)
    assert isinstance(mdata, np.memmap)

    # make sure errors are correct
    wrongshape = (2, 5)
    with pytest.raises(IOError):
        _ = read_raw_data(fname, dtype, wrongshape)

    # --- partial reads with offset ---
    shape = (5, 15, 10)
    shape_subset = (15, 10)
    # each level k of the test data is filled with the constant k
    testdata = np.zeros(shape, dtype)
    x = np.arange(shape[0], dtype=dtype)
    for k in np.arange(shape[0]):
        testdata[k, :, :] = x[k]
    # write to a file
    datafile = tmpdir.join("tmp.data")
    datafile.write_binary(testdata.tobytes())
    fname = str(datafile)
    # read each level back via its byte offset
    for k in np.arange(shape[0]):
        offset = (k * shape[1] * shape[2] * dtype.itemsize)
        data = read_raw_data(fname, dtype, shape_subset,
                             offset=offset, partial_read=True)
        np.testing.assert_allclose(data, testdata[k, :, :])
        assert isinstance(data, np.ndarray) and not isinstance(
            data, np.memmap)
        # check memmap
        mdata = read_raw_data(fname, dtype, shape_subset,
                              offset=offset, partial_read=True,
                              use_mmap=True)
        assert isinstance(mdata, np.memmap)

    # test it breaks when it should
    with pytest.raises(IOError):
        # read with wrong shape
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=0, partial_read=False)
    with pytest.raises(IOError):
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=0, partial_read=False, use_mmap=True)
    with pytest.raises(ValueError):
        # use offset when trying to read global file
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=4, partial_read=False)
    with pytest.raises(ValueError):
        _ = read_raw_data(fname, dtype, shape_subset,
                          offset=4, partial_read=False, use_mmap=True)
    # offset is too big
    with pytest.raises(ValueError):
        _ = read_raw_data(fname, dtype, shape, offset=(
            shape[0] * shape[1] * shape[2] * dtype.itemsize),
            partial_read=True)
    with pytest.raises(ValueError):
        _ = read_raw_data(fname, dtype, shape, offset=(
            shape[0] * shape[1] * shape[2] * dtype.itemsize),
            partial_read=True, use_mmap=True)

# a meta test
def test_file_hiding(all_mds_datadirs):
Expand Down
54 changes: 38 additions & 16 deletions xmitgcm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,8 @@ def read_mds(fname, iternum=None, use_mmap=True, force_dict=True, endian='>',
return out


def read_raw_data(datafile, dtype, shape, use_mmap=False, offset=0,
                  order='C', partial_read=False):
    """Read a raw binary file and shape it.

    PARAMETERS
    ----------
    datafile : str
        Path of the binary file to read
    dtype : numpy.dtype
        Data type of the values in the file
    shape : tuple
        Shape of the data to return
    use_mmap : bool, optional
        Whether to read the data using a numpy.memmap
    offset : int, optional
        Offset (in bytes) to apply on read; only valid together with
        ``partial_read=True``
    order : str, optional
        Row/Column Major = 'C' or 'F'
    partial_read : bool, optional
        If reading part of the file

    RETURNS
    -------
    data : numpy.ndarray
        The data (or a memmap to it)

    RAISES
    ------
    ValueError
        If ``offset`` is non-zero while ``partial_read`` is False, or if
        ``offset`` lies at or past the end of the file.
    IOError
        If a full read is requested and the file size does not match
        ``shape`` and ``dtype``.
    """
    number_of_values = reduce(lambda x, y: x * y, shape)
    expected_number_of_bytes = number_of_values * dtype.itemsize
    actual_number_of_bytes = os.path.getsize(datafile)

    if not partial_read:
        # offset only makes sense when reading part of the file; reject
        # inconsistent arguments rather than silently ignoring the offset
        if offset != 0:
            raise ValueError(
                'When partial_read==False, offset will not be read')
        # a full read must consume exactly the whole file
        if expected_number_of_bytes != actual_number_of_bytes:
            raise IOError('File `%s` does not have the correct size '
                          '(expected %g, found %g)' %
                          (datafile,
                           expected_number_of_bytes,
                           actual_number_of_bytes))

    # the byte offset must fall inside the file
    if offset >= actual_number_of_bytes:
        raise ValueError('bytes offset %g is greater than file size %g' %
                         (offset, actual_number_of_bytes))

    with open(datafile, 'rb') as f:
        if use_mmap:
            data = np.memmap(f, dtype=dtype, mode='r', offset=offset,
                             shape=tuple(shape), order=order)
        else:
            f.seek(offset)
            data = np.fromfile(f, dtype=dtype, count=number_of_values)
            data = data.reshape(shape, order=order)
    data.shape = shape
    return data


def parse_available_diagnostics(fname, layers={}):
Expand Down

0 comments on commit 6b24a3c

Please sign in to comment.