Merge pull request #126 from y1zhou/feat/read-options

Add ReadOptions implementation
douweschulte · Jun 21, 2024 · 1298c60 · 1298c60
2 parents a21b17c + e82ec7f
commit 1298c60
Show file tree

Hide file tree

Showing 29 changed files with 2,533 additions and 309 deletions.
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
@@ -1,9 +1,10 @@
-use pdbtbx::*;
 use std::fs::File;
 use std::io::prelude::*;
 use std::io::BufWriter;
 use std::time::{Duration, Instant};
 
+use pdbtbx::*;
+
 fn main() {
     // Setup the data needed
     let pdb_names = vec![
@@ -19,8 +20,11 @@ fn main() {
         ("big", "example-pdbs/pTLS-6484.cif"),
     ];
     let mut models = Vec::with_capacity(pdb_names.len());
+    let parser = ReadOptions::default()
+        .set_level(crate::StrictnessLevel::Loose)
+        .set_format(Format::Pdb);
     for (name, path) in &pdb_names {
-        models.push((*name, open_pdb(path, StrictnessLevel::Loose).unwrap().0))
+        models.push((*name, parser.read(path).unwrap().0))
     }
     let mut results = Vec::new();
 
@@ -56,7 +60,10 @@ fn main() {
 }
 
 fn bench_open(filename: &str) {
-    let (_pdb, _errors) = open(filename, StrictnessLevel::Loose).unwrap();
+    let (_pdb, _errors) = ReadOptions::default()
+        .set_level(crate::StrictnessLevel::Loose)
+        .read(filename)
+        .unwrap();
 }
 
 fn bench_transformation(mut pdb: PDB) {

diff --git a/example-pdbs/rosetta_model.cif b/example-pdbs/rosetta_model.cif
diff --git a/examples/selection.rs b/examples/selection.rs
@@ -1,7 +1,11 @@
 use pdbtbx::*;
 
 fn main() {
-    let (pdb, _errors) = open_pdb("example-pdbs/1ubq.pdb", StrictnessLevel::Loose).unwrap();
+    let (pdb, _errors) = ReadOptions::default()
+        .set_level(StrictnessLevel::Loose)
+        .set_format(Format::Pdb)
+        .read("example-pdbs/1ubq.pdb")
+        .unwrap();
 
     // Two ways of selecting the following atom in the PDB file, the first search can be somewhat faster
     // because it can discard other chains which the second search has to test.

diff --git a/examples/sphere.rs b/examples/sphere.rs
@@ -1,16 +1,22 @@
-use pdbtbx::*;
 use rayon::iter::ParallelIterator;
 
+use pdbtbx::*;
+
 fn main() {
-    atom_sphere();
-    residue_sphere();
-    find_clashes();
+    let (pdb, _errors) = ReadOptions::new()
+        .set_level(StrictnessLevel::Loose)
+        .set_format(Format::Pdb)
+        .read("example-pdbs/1ubq.pdb")
+        .unwrap();
+
+    atom_sphere(&pdb);
+    residue_sphere(&pdb);
+    find_clashes(&pdb);
 }
 
 /// Find all Atoms in a sphere around a single origin Atom with a user-defined radius
 /// This is using the features `rstar` and `rayon`.
-fn atom_sphere() {
-    let (pdb, _errors) = open_pdb("example-pdbs/1ubq.pdb", StrictnessLevel::Loose).unwrap();
+fn atom_sphere(pdb: &PDB) {
     let (origin_id, radius): (usize, f64) = (12, 3.5);
 
     // Leverage parallel searching
@@ -31,8 +37,7 @@ fn atom_sphere() {
 /// Find all Atoms belonging to a Residue that has at least one Atom within a sphere of
 /// user-defined origin and radius.
 /// This is using the features `rstar` and `rayon`.
-fn residue_sphere() {
-    let (pdb, _errors) = open_pdb("example-pdbs/1ubq.pdb", StrictnessLevel::Loose).unwrap();
+fn residue_sphere(pdb: &PDB) {
     let (origin_id, radius): (usize, f64) = (12, 3.5);
 
     let sphere_origin = pdb
@@ -72,8 +77,7 @@ fn residue_sphere() {
 /// Results for Atoms within the same Residue are excluded as well as those from the C and N Atoms
 /// constituting the peptide bond of neighbouring amino acids.
 /// Also, Atoms are not counted twice.
-fn find_clashes() {
-    let (pdb, _errors) = open_pdb("example-pdbs/1ubq.pdb", StrictnessLevel::Loose).unwrap();
+fn find_clashes(pdb: &PDB) {
     let tree = pdb.create_hierarchy_rtree();
 
     let mut clashing_atoms = Vec::new();

diff --git a/examples/waterbox.rs b/examples/waterbox.rs
@@ -1,8 +1,9 @@
-use pdbtbx::*;
 use std::env;
 use std::path::Path;
 use std::time::Instant;
 
+use pdbtbx::*;
+
 fn main() {
     let filename = env::current_dir()
         .unwrap()
@@ -23,8 +24,11 @@ fn main() {
 fn create_waterbox(size: (f64, f64, f64)) -> PDB {
     let now = Instant::now();
 
-    let (mut liquid, _errors) =
-        open_pdb("example-pdbs/liquid.pdb", StrictnessLevel::Loose).unwrap();
+    let (mut liquid, _errors) = ReadOptions::new()
+        .set_level(StrictnessLevel::Loose)
+        .set_format(Format::Pdb)
+        .read("example-pdbs/liquid.pdb")
+        .unwrap();
 
     let time = now.elapsed();
 

diff --git a/src/error/context.rs b/src/error/context.rs
@@ -110,7 +110,7 @@ impl Context {
         } else {
             Context::Line {
                 linenumber: pos.line,
-                line: pos.text.lines().into_iter().next().unwrap().to_string(),
+                line: pos.text.lines().next().unwrap().to_string(),
                 offset: 0,
                 length: 3,
             }
@@ -132,7 +132,6 @@ impl Context {
                 lines: start
                     .text
                     .lines()
-                    .into_iter()
                     .take(end.line - start.line)
                     .map(ToString::to_string)
                     .collect::<Vec<String>>(),

diff --git a/src/lib.rs b/src/lib.rs
@@ -24,10 +24,7 @@
 //!
 //! ```rust
 //! use pdbtbx::*;
-//! let (mut pdb, _errors) = pdbtbx::open(
-//!         "example-pdbs/1ubq.pdb",
-//!         StrictnessLevel::Medium
-//!     ).unwrap();
+//! let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1ubq.pdb").unwrap();
 //!
 //! pdb.remove_atoms_by(|atom| atom.element() == Some(&Element::H)); // Remove all H atoms
 //!
@@ -65,15 +62,15 @@
     doc = r##"
 ```rust
 use pdbtbx::*;
-let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1ubq.pdb", pdbtbx::StrictnessLevel::Medium).unwrap();
+let (mut pdb, _errors) = pdbtbx::open("example-pdbs/1ubq.pdb").unwrap();
 // You can loop over all atoms within 3.5 Å of a specific atom
 // Note: The `locate_within_distance` method takes a squared distance
 let tree = pdb.create_atom_rtree();
 for atom in tree.locate_within_distance(pdb.atom(42).unwrap().pos(), 3.5 * 3.5) {
     println!("{}", atom);
 }
 
-// You can even get information about the hierarchy of these atoms 
+// You can even get information about the hierarchy of these atoms
 // (the chain, residue and conformer that contain this atom)
 let tree = pdb.create_hierarchy_rtree();
 let mut total = 0;

diff --git a/src/read/general.rs b/src/read/general.rs
@@ -1,52 +1,29 @@
-use std::io::{BufRead, Read, Seek};
-
-use super::*;
 use crate::error::*;
 use crate::structs::PDB;
 use crate::StrictnessLevel;
 
-#[cfg(feature = "compression")]
-use super::mmcif::open_mmcif_bufread;
-#[cfg(feature = "compression")]
-use flate2::read::GzDecoder;
-#[cfg(feature = "compression")]
-use std::fs;
+use super::*;
 
 /// Standard return type for reading a file.
-pub type ReadResult = std::result::Result<(PDB, Vec<PDBError>), Vec<PDBError>>;
+pub type ReadResult = Result<(PDB, Vec<PDBError>), Vec<PDBError>>;
 
-/// Open an atomic data file, either PDB or mmCIF/PDBx. The correct type will be
-/// determined based on the file extension. This function is equivalent to
-/// [`ReadOptions::read()`] with default options, apart from the `level` which
-/// can be set by the `level` parameter.
+/// Open an atomic data file, either PDB or mmCIF/PDBx.
+///
+/// This function is equivalent to [`ReadOptions::read()`] with default options.
+/// The correct type will be determined based on the file extension.
+/// Gzipped files can also be opened directly if the file extension is one of
+/// `.pdb.gz`, `.pdb1.gz`, `.mmcif.gz`, or `.cif.gz`.
 ///
 /// # Errors
 /// Returns a `PDBError` if a `BreakingError` is found. Otherwise it returns the PDB with all errors/warnings found while parsing it.
 ///
 /// # Related
-/// If you want to open a file from memory see [`open_raw`]. There are also function to open a specified file type directly
-/// see [`crate::open_pdb`] and [`crate::open_mmcif`] respectively.
-pub fn open(filename: impl AsRef<str>, level: StrictnessLevel) -> ReadResult {
-    open_with_options(filename, &ReadOptions::new().set_level(level))
-}
-
-/// Opens a files based on the given options.
-pub(in crate::read) fn open_with_options(
-    filename: impl AsRef<str>,
-    options: &ReadOptions,
-) -> ReadResult {
-    if check_extension(&filename, "pdb") {
-        open_pdb(filename, options.level)
-    } else if check_extension(&filename, "cif") {
-        open_mmcif(filename, options.level)
-    } else {
-        Err(vec![PDBError::new(
-            ErrorLevel::BreakingError,
-            "Incorrect extension",
-            "Could not determine the type of the given file, make it .pdb or .cif",
-            Context::show(filename.as_ref()),
-        )])
-    }
+/// If you want to open a file from memory see [`ReadOptions::read_raw`].
+/// The file type can be set explicitly with [`ReadOptions::set_format`].
+/// These functions are useful if you are using a non-standard compression algorithm or way of
+/// storing the data.
+pub fn open(filename: impl AsRef<str>) -> ReadResult {
+    ReadOptions::default().read(filename)
 }
 
 /// Open a compressed atomic data file, either PDB or mmCIF/PDBx. The correct type will be
@@ -56,104 +33,20 @@ pub(in crate::read) fn open_with_options(
 /// Returns a `PDBError` if a `BreakingError` is found. Otherwise it returns the PDB with all errors/warnings found while parsing it.
 ///
 /// # Related
-/// If you want to open a file from memory see [`open_raw`], [`crate::open_pdb_raw`] and [`crate::open_mmcif_bufread`].
+/// If you want to open a file from memory see [`ReadOptions::read_raw`].
+/// The file type can be set explicitly with [`ReadOptions::set_format`].
 /// These functions are useful if you are using a non-standard compression algorithm or way of
 /// storing the data.
 #[cfg(feature = "compression")]
+#[deprecated(
+    since = "0.12.0",
+    note = "Please use `ReadOptions::default().set_decompress(true).read(filename)` instead"
+)]
 pub fn open_gz(filename: impl AsRef<str>, level: StrictnessLevel) -> ReadResult {
-    let filename = filename.as_ref();
-
-    if check_extension(filename, "gz") {
-        // open a decompression stream
-        let file = fs::File::open(filename).map_err(|_| {
-            vec![PDBError::new(
-                ErrorLevel::BreakingError,
-                "Could not open file",
-                "Could not open the given file, make sure it exists and you have the correct permissions",
-                Context::show(filename),
-            )]
-        })?;
-
-        let decompressor = GzDecoder::new(file);
-
-        let reader = std::io::BufReader::new(decompressor);
-
-        if check_extension(&filename[..filename.len() - 3], "pdb") {
-            open_pdb_raw(reader, Context::show(filename), level)
-        } else if check_extension(&filename[..filename.len() - 3], "cif") {
-            open_mmcif_bufread(reader, level)
-        } else {
-            Err(vec![PDBError::new(
-                ErrorLevel::BreakingError,
-                "Incorrect extension",
-                "Could not determine the type of the given file, make it .pdb.gz or .cif.gz",
-                Context::show(filename),
-            )])
-        }
-    } else {
-        Err(vec![PDBError::new(
-            ErrorLevel::BreakingError,
-            "Incorrect extension",
-            "Could not determine the type of the given file, make it .pdb.gz or .cif.gz",
-            Context::show(filename),
-        )])
-    }
-}
-
-/// Open a stream with either PDB or mmCIF data. The distinction is made on the start of the first line.
-/// If it starts with `HEADER` it is a PDB file, if it starts with `data_` it is a mmCIF file.
-///
-/// # Errors
-/// Returns a `PDBError` if a `BreakingError` is found. Otherwise it returns the PDB with all errors/warnings found while parsing it.
-/// It returns a breaking error if the buffer could not be read, the file type could not be determined form the first line, or there was a breaking error in the file itself.
-/// See the `PDBError` for more details.
-///
-/// # Related
-/// If you want to open a file see [`open`]. There are also function to open a specified file type directly
-/// see [`crate::open_pdb_raw`] and [`crate::open_mmcif_raw`] respectively.
-pub fn open_raw<T: std::io::Read + std::io::Seek>(
-    mut input: std::io::BufReader<T>,
-    level: StrictnessLevel,
-) -> ReadResult {
-    let mut first_line = String::new();
-    if input.read_line(&mut first_line).is_err() {
-        return Err(vec![PDBError::new(
-            ErrorLevel::BreakingError,
-            "Buffer could not be read",
-            "The buffer provided to `open_raw` could not be read.",
-            Context::None,
-        )]);
-    }
-    if input.rewind().is_err() {
-        return Err(vec![PDBError::new(
-            ErrorLevel::BreakingError,
-            "Buffer could not be read",
-            "The buffer provided to `open_raw` could not be rewound to the start.",
-            Context::None,
-        )]);
-    }
-    if first_line.starts_with("HEADER") {
-        open_pdb_raw(input, Context::None, level)
-    } else if first_line.starts_with("data_") {
-        let mut contents = String::new();
-        if input.read_to_string(&mut contents).is_ok() {
-            open_mmcif_raw(&contents, level)
-        } else {
-            Err(vec![PDBError::new(
-                ErrorLevel::BreakingError,
-                "Buffer could not be read",
-                "The buffer provided to `open_raw` could not be read to end.",
-                Context::show(&first_line),
-            )])
-        }
-    } else {
-        Err(vec![PDBError::new(
-            ErrorLevel::BreakingError,
-            "Could not determine file type",
-            "Could not determine the type of the given file, make it .pdb or .cif",
-            Context::show(&first_line),
-        )])
-    }
+    ReadOptions::default()
+        .set_level(level)
+        .guess_format(filename.as_ref())
+        .read(filename)
 }
 
 #[cfg(test)]
@@ -162,19 +55,17 @@ mod tests {
 
     #[test]
     fn open_invalid() {
-        assert!(open("file.png", StrictnessLevel::Medium).is_err());
-        assert!(open("file.mmcif", StrictnessLevel::Medium).is_err());
-        assert!(open("file.pdbml", StrictnessLevel::Medium).is_err());
-        assert!(open("file.pd", StrictnessLevel::Medium).is_err());
+        assert!(open("file.png").is_err());
+        assert!(open("file.mmcif").is_err());
+        assert!(open("file.pdbml").is_err());
+        assert!(open("file.pd").is_err());
     }
 
     #[test]
     fn open_not_existing() {
-        let pdb =
-            open("file.pdb", StrictnessLevel::Medium).expect_err("This file should not exist.");
+        let pdb = open("file.pdb").expect_err("This file should not exist.");
         assert_eq!(pdb[0].short_description(), "Could not open file");
-        let cif =
-            open("file.cif", StrictnessLevel::Medium).expect_err("This file should not exist.");
+        let cif = open("file.cif").expect_err("This file should not exist.");
         assert_eq!(cif[0].short_description(), "Could not open file");
     }
 }