Skip to content

Commit

Permalink
Some refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
kvg committed May 31, 2024
1 parent 7c44b4d commit a059832
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 24 deletions.
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/hidive/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ clap = { version = "4.5.1", features = ["derive"] }
eyre = "0.6.8"
needletail = "*"
noodles = "*"
parquet = "*"
path-absolutize = "3.1.1"
parquet = "*"
petgraph = "*"
#poasta = { path = "/Users/kiran/repositories/poasta" }
#polars = "*"
Expand Down
19 changes: 1 addition & 18 deletions src/hidive/src/fetch.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
// Import necessary standard library modules
use std::path::PathBuf;
use std::collections::HashSet;

// Import the Absolutize trait to convert relative paths to absolute paths
use path_absolutize::Absolutize;

// Import the Url type to work with URLs
use url::Url;

// Import the skydive module, which contains the necessary functions for staging data
use skydive;

Expand All @@ -25,20 +21,7 @@ use skydive;
/// Panics if any locus in `loci_list` cannot be parsed.
pub fn start(output: &PathBuf, loci_list: &Vec<String>, bam_paths: &Vec<PathBuf>, require_spanning_reads: bool) {
let loci = skydive::utils::parse_loci(loci_list);

// Convert the list of BAM file paths into a HashSet of URLs
let reads_urls: HashSet<Url> = bam_paths
.iter()
// Use filter_map to attempt to parse each path as a URL, and collect the successful ones
.filter_map(|path| {
let path_str = path.to_string_lossy();
if path_str.starts_with("gs://") {
Url::parse(&path_str).ok()
} else {
Url::from_file_path(path.absolutize().unwrap()).ok()
}
})
.collect();
let reads_urls = skydive::utils::parse_file_names(bam_paths);

// Get the system's temporary directory path
let cache_path = std::env::temp_dir();
Expand Down
1 change: 1 addition & 0 deletions src/skydive/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ memoffset = "0.9.0"
ndarray = { version = "0.15.6", features = ["rayon"] }
needletail = "*"
openssl = { version = "0.10", features = ["vendored"] }
path-absolutize = "3.1.1"
parquet = "*"
petgraph = "*"
#polars = { version = "*", features = ["parquet", "lazy", "csv-file", "strings", "temporal", "dtype-duration", "dtype-categorical", "concat_str", "list", "list_eval", "rank", "lazy_regex"]}
Expand Down
5 changes: 4 additions & 1 deletion src/skydive/src/stage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use rust_htslib::bam::ext::BamRecordExtensions;
use std::collections::{HashMap, HashSet};
use std::env;
use std::path::PathBuf;
use std::io::{self, Write};
use linear_map::LinearMap;

// Import the Url type to work with URLs.
Expand All @@ -19,6 +20,7 @@ use gag::Gag;

// Import rayon's parallel iterator traits.
use rayon::prelude::*;
use rayon::iter::{ParallelIterator, IntoParallelRefIterator};

// Import types from rust_htslib for working with BAM files.
use rust_htslib::bam::{ self, Header, IndexedReader, Read };
Expand Down Expand Up @@ -102,6 +104,7 @@ fn stage_data_from_all_files(
loci: &HashSet<(String, u64, u64)>,
cache_path: &PathBuf,
) -> Result<Vec<(Vec<LinearMap<std::string::String, std::string::String>>, Vec<rust_htslib::bam::Record>)>> {

// Use a parallel iterator to process multiple BAM files concurrently.
let all_data: Vec<_> = reads_urls
.par_iter()
Expand Down Expand Up @@ -153,7 +156,7 @@ pub fn stage_data(
loci: &HashSet<(String, u64, u64)>,
reads_urls: &HashSet<Url>,
cache_path: &PathBuf,
require_spanning_reads: bool
require_spanning_reads: bool,
) -> Result<()> {
// Disable stderr from trying to open an IndexedReader a few times, so
// that the Jupyter notebook user doesn't get confused by intermediate
Expand Down
63 changes: 61 additions & 2 deletions src/skydive/src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::collections::HashSet;
use std::{collections::HashSet, io::BufRead, path::PathBuf};

use anyhow::Result;
use url::Url;

use path_absolutize::Absolutize;

pub fn parse_loci(loci_list: &Vec<String>) -> HashSet<(String, u64, u64)> {
// Initialize a HashSet to store unique loci after parsing
Expand Down Expand Up @@ -58,4 +61,60 @@ pub fn parse_locus(locus: String) -> Result<(String, u64, u64)> {
} else {
anyhow::bail!("Locus format for '{}' is incorrect. It should be 'chr:start[-stop]'.", locus);
}
}
}

/// Converts a list of BAM/CRAM file paths into a `HashSet` of URLs.
///
/// Paths beginning with `gs://` are parsed as-is; all other paths are treated
/// as local files and absolutized before conversion to a `file://` URL.
/// Any local file ending in `.txt` is assumed to be a file of filenames
/// (FOFN): each of its lines is expanded into the result set (using the same
/// gs:// / local-path rules), and the FOFN entry itself is removed.
///
/// Paths that cannot be absolutized or converted to a URL are silently
/// skipped rather than panicking.
pub fn parse_file_names(bam_paths: &Vec<PathBuf>) -> HashSet<Url> {
    // Single shared conversion: gs:// paths parse directly as URLs; local
    // paths must be absolutized first because relative paths cannot form a
    // valid file:// URL. Unconvertible paths yield None and are dropped.
    let path_to_url = |path: &PathBuf| -> Option<Url> {
        let path_str = path.to_string_lossy();
        if path_str.starts_with("gs://") {
            Url::parse(&path_str).ok()
        } else {
            Url::from_file_path(path.absolutize().ok()?).ok()
        }
    };

    // Convert the list of BAM file paths into a HashSet of URLs.
    let mut reads_urls: HashSet<Url> = bam_paths.iter().filter_map(path_to_url).collect();

    // Identify local files ending in .txt: these are files of filenames.
    let fofn_urls: Vec<Url> = reads_urls
        .iter()
        .filter(|url| {
            url.scheme() == "file"
                && url
                    .to_file_path()
                    .map(|p| p.extension().and_then(std::ffi::OsStr::to_str) == Some("txt"))
                    .unwrap_or(false)
        })
        .cloned()
        .collect();

    // Replace each FOFN entry with the files it lists.
    for url in fofn_urls {
        reads_urls.remove(&url);

        // Safe: we only collected URLs whose scheme is "file" above.
        let path = url.to_file_path().unwrap();
        if let Ok(file) = std::fs::File::open(&path) {
            let listed: Vec<PathBuf> = std::io::BufReader::new(file)
                .lines()
                // Skip unreadable lines rather than aborting the expansion.
                .filter_map(Result::ok)
                .map(PathBuf::from)
                .collect();
            reads_urls.extend(listed.iter().filter_map(path_to_url));
        }
    }

    reads_urls
}

0 comments on commit a059832

Please sign in to comment.