First attempt at h5 file format

jbeilstenedmands · Feb 6, 2024 · ff3e1d3 · ff3e1d3
1 parent bf73681
commit ff3e1d3
Show file tree

Hide file tree

Showing 6 changed files with 249 additions and 1 deletion.
diff --git a/src/dials/array_family/boost_python/flex_ext.cc b/src/dials/array_family/boost_python/flex_ext.cc
@@ -31,6 +31,7 @@ namespace dials { namespace af { namespace boost_python {
   void export_flex_unit_cell();
   void export_flex_shoebox_extractor();
   void export_flex_binner();
+  void export_shoebox_extract();
 
   template <typename FloatType>
   std::string get_real_type();
@@ -55,6 +56,7 @@ namespace dials { namespace af { namespace boost_python {
     export_flex_unit_cell();
     export_flex_shoebox_extractor();
     export_flex_binner();
+    export_shoebox_extract();
 
     def("get_real_type", &get_real_type<ProfileFloatType>);
 

diff --git a/src/dials/array_family/boost_python/flex_reflection_table.cc b/src/dials/array_family/boost_python/flex_reflection_table.cc
@@ -760,6 +760,31 @@ namespace dials { namespace af { namespace boost_python {
     return data_bytes;
   }
 
+  boost::python::list get_shoebox_data_arrays(reflection_table self) {
+    boost::python::list result;
+    af::shared<Shoebox<> > sbox = self["shoebox"];
+    size_t n = 0;
+    for (int i = 0; i < self.size(); ++i) {
+      const Shoebox<> &s1 = sbox[i];
+      n += s1.data.size();
+    }
+    size_t ntot = 0;
+    af::shared<float> data_array(n, 0);
+    af::shared<float> bg_array(n, 0);
+    af::shared<size_t> mask_array(n, 0);
+    for (int i = 0; i < self.size(); ++i) {
+      const Shoebox<> &s1 = sbox[i];
+      std::copy(s1.data.begin(), s1.data.end(), data_array.begin() + ntot);
+      std::copy(s1.background.begin(), s1.background.end(), bg_array.begin() + ntot);
+      std::copy(s1.mask.begin(), s1.mask.end(), mask_array.begin() + ntot);
+      ntot += s1.data.size();
+    }
+    result.append(data_array);
+    result.append(bg_array);
+    result.append(mask_array);
+    return result;
+  }
+
   /**
    * Pack the reflection table in msgpack format into a streambuf object
    * @param self The reflection table
@@ -938,6 +963,7 @@ namespace dials { namespace af { namespace boost_python {
         .def("as_msgpack_to_file", &reflection_table_as_msgpack_to_file)
         .def("from_msgpack", &reflection_table_from_msgpack)
         .staticmethod("from_msgpack")
+        .def("get_shoebox_data_arrays", &get_shoebox_data_arrays)
         .def("experiment_identifiers", &T::experiment_identifiers)
         .def("select", &reflection_table_suite::select_rows_index<flex_table_type>)
         .def("select", &reflection_table_suite::select_rows_flags<flex_table_type>)

diff --git a/src/dials/array_family/boost_python/flex_shoebox_extractor.cc b/src/dials/array_family/boost_python/flex_shoebox_extractor.cc
@@ -34,4 +34,8 @@ namespace dials { namespace af { namespace boost_python {
       .def("npanals", &ShoeboxExtractor::npanels);
   }
 
+  void export_shoebox_extract() {
+    def("ShoeboxExtractFromData", &ShoeboxExtractFromData);
+  }
+
 }}}  // namespace dials::af::boost_python
diff --git a/src/dials/array_family/flex_ext.py b/src/dials/array_family/flex_ext.py
@@ -230,6 +230,8 @@ def as_file(self, filename):
         """
         if os.getenv("DIALS_USE_PICKLE"):
             self.as_pickle(filename)
+        elif os.getenv("DIALS_USE_H5"):
+            self.as_new_h5(filename)
         else:
             self.as_msgpack_file(filename)
 
@@ -243,7 +245,14 @@ def from_file(filename):
                 filename
             )
         except RuntimeError:
-            return dials_array_family_flex_ext.reflection_table.from_pickle(filename)
+            try:
+                return dials_array_family_flex_ext.reflection_table.from_new_h5(
+                    filename
+                )
+            except OSError:
+                return dials_array_family_flex_ext.reflection_table.from_pickle(
+                    filename
+                )
 
     @staticmethod
     def empty_standard(nrows):
@@ -375,6 +384,24 @@ def as_h5(self, filename):
         handle.set_reflections(self)
         handle.close()
 
+    def as_new_h5(self, filename):
+        from dials.util.nexus.nexus_new import NewNexusFile
+
+        handle = NewNexusFile(filename, "w")
+        # Clean up any removed experiments from the identifiers map
+        self.clean_experiment_identifiers_map()
+        handle.set_reflections(self)
+        handle.close()
+
+    @staticmethod
+    def from_new_h5(filename):
+        from dials.util.nexus.nexus_new import NewNexusFile
+
+        handle = NewNexusFile(filename, "r")
+        table = handle.get_reflections()
+        handle.close()
+        return table
+
     def as_miller_array(self, experiment, intensity="sum"):
         """Return a miller array with the chosen intensities.
 

diff --git a/src/dials/array_family/shoebox_extractor.h b/src/dials/array_family/shoebox_extractor.h
@@ -30,6 +30,30 @@ namespace dials { namespace af {
   using model::Shoebox;
   using model::Valid;
 
+  void ShoeboxExtractFromData(af::reflection_table data,
+                              af::shared<float> shoebox_data,
+                              af::shared<float> background_data,
+                              af::shared<size_t> mask_data) {
+    af::shared<Shoebox<> > shoebox_ = data["shoebox"];
+    std::vector<int> sizes(data.size());
+    for (int i = 0; i < data.size(); ++i) {
+      sizes[i] = shoebox_[i].data.size();
+    }
+    int n = 0;
+    for (int i = 0; i < data.size(); ++i) {
+      size_t size_ = sizes[i];
+      std::copy(shoebox_data.begin() + n,
+                shoebox_data.begin() + n + size_,
+                shoebox_[i].data.begin());
+      std::copy(background_data.begin() + n,
+                background_data.begin() + n + size_,
+                shoebox_[i].background.begin());
+      std::copy(
+        mask_data.begin() + n, mask_data.begin() + n + size_, shoebox_[i].mask.begin());
+      n += size_;
+    }
+  }
+
   /**
    * A class to extract shoebox pixels from images
    */

diff --git a/src/dials/util/nexus/nexus_new.py b/src/dials/util/nexus/nexus_new.py
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+import numpy as np
+
+from dxtbx import flumpy
+
+
+class ReflectionListEncoder(object):
+    """Encoder for the reflection data."""
+
+    def encode(self, reflections, handle):
+        """Encode the reflection data."""
+
+        # Create the reflection data group
+        group = handle.create_group("entry/data_processing")
+        group.attrs["num_reflections"] = reflections.size()
+        identifiers_group = handle.create_group("entry/experiment_identifiers")
+        identifiers = np.array(
+            [str(i) for i in reflections.experiment_identifiers().values()],
+            dtype="S",
+        )
+        identifiers_group.create_dataset(
+            "identifiers", data=identifiers, dtype=identifiers.dtype
+        )
+        ids = np.array(list(reflections.experiment_identifiers().keys()), dtype=int)
+        identifiers_group.create_dataset("ids", data=ids, dtype=ids.dtype)
+
+        self.encode_columns(group, reflections)
+
+    def encode_columns(self, group, table):
+        """Encode a column of data."""
+        from dials.array_family import flex
+
+        for key, data in table.cols():
+            if isinstance(data, flex.shoebox):
+                self.encode_shoebox(group, table)
+            elif isinstance(data, flex.int6):
+                s = data.size()
+                this_data = flumpy.to_numpy(data.as_int()).reshape(s, 6)
+                group.create_dataset(
+                    key, data=this_data, shape=this_data.shape, dtype=this_data.dtype
+                )
+            else:
+                this_data = flumpy.to_numpy(data)
+                group.create_dataset(
+                    key, data=this_data, shape=this_data.shape, dtype=this_data.dtype
+                )
+
+    def encode_shoebox(self, group, table):
+        """Encode a column of shoeboxes."""
+        sbdata, bg, mask = table.get_shoebox_data_arrays()
+        data = flumpy.to_numpy(sbdata)
+        group.create_dataset(
+            "shoebox_data", data=data, shape=data.shape, dtype=data.dtype
+        )
+        data = flumpy.to_numpy(bg)
+        group.create_dataset(
+            "shoebox_background", data=data, shape=data.shape, dtype=data.dtype
+        )
+        data = flumpy.to_numpy(mask)
+        group.create_dataset(
+            "shoebox_mask", data=data, shape=data.shape, dtype=data.dtype
+        )
+
+
+class ReflectionListDecoder(object):
+    """Decoder for the reflection data."""
+
+    def decode(self, handle):
+        """Decode the reflection data."""
+        import dials_array_family_flex_ext
+        from dials.array_family import flex
+
+        # Get the group containing the reflection data
+        g = handle["entry/data_processing"]
+
+        # Create the list of reflections
+        table = flex.reflection_table(int(g.attrs["num_reflections"]))
+        identifiers = handle["entry"]["experiment_identifiers"]["identifiers"][()]
+        ids = handle["entry"]["experiment_identifiers"]["ids"][()]
+        for id_, identifier in zip(ids, identifiers):
+            table.experiment_identifiers()[int(id_)] = str(identifier.decode())
+
+        # Decode all the columns
+        shoebox_data = None
+        shoebox_background = None
+        shoebox_mask = None
+        for key in g:
+            item = g[key]
+            if key == "shoebox_data":
+                shoebox_data = flumpy.from_numpy(np.array(g[key]))
+            elif key == "shoebox_background":
+                shoebox_background = flumpy.from_numpy(np.array(g[key]))
+            elif key == "shoebox_mask":
+                shoebox_mask = flumpy.from_numpy(np.array(g[key]))
+            else:
+                val = self._convert(key, item)
+                if val:
+                    table[str(key)] = val
+
+        if "panel" in table and "bbox" in table:
+            table["shoebox"] = flex.shoebox(
+                table["panel"], table["bbox"], allocate=True
+            )
+            if shoebox_mask and shoebox_background and shoebox_data:
+                dials_array_family_flex_ext.ShoeboxExtractFromData(
+                    table, shoebox_data, shoebox_background, shoebox_mask
+                )
+
+        # Return the list of reflections
+        return table
+
+    def _convert(self, key, data):
+        # Must allow that the data were written by a program outside of DIALS, so no special
+        # knowledge of flex types should be assumed.
+        from dials.array_family import flex
+
+        if key == "miller_index":  # special
+            assert len(data.shape) == 2 and data.shape[1] == 3
+            new = flumpy.miller_index_from_numpy(np.array(data))
+        elif len(data.shape) == 2:
+            if data.shape[1] == 3 or data.shape[1] == 2:  # vec3 or vec2 double
+                new = flumpy.vec_from_numpy(np.array(data))
+            elif data.shape[1] == 6:
+                new = flex.int6(flumpy.from_numpy(np.array(data).flatten()))
+                # N.B. flumpy could support this - but int6 only currently defined in dials, so would have to
+                # move that to dxtbx first.
+            else:
+                raise RuntimeError(
+                    "Unrecognised 2D data dimensions (expected shape (N,m)) where m is 2,3 or 6"
+                )
+        else:
+            new = flumpy.from_numpy(np.array(data))
+        return new
+
+
+class NewNexusFile:
+    """Interface to Nexus file."""
+
+    def __init__(self, filename, mode="a"):
+        """Open the file with the given mode."""
+        import h5py
+
+        self._handle = h5py.File(filename, mode)
+
+    def close(self):
+        """Close the file."""
+        self._handle.close()
+        del self._handle
+
+    def set_data(self, data, encoder):
+        """Set the model data using the supplied encoder."""
+        encoder.encode(data, self._handle)
+
+    def get_data(self, decoder):
+        """Get the model data using the supplied decoder."""
+        return decoder.decode(self._handle)
+
+    def set_reflections(self, reflections):
+        """Set the reflection data."""
+        self.set_data(reflections, ReflectionListEncoder())
+
+    def get_reflections(self):
+        """Get the reflection data."""
+        return self.get_data(ReflectionListDecoder())