Skip to content

Commit

Permalink
Some refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
kvg committed May 31, 2024
1 parent 7c44b4d commit a059832
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 24 deletions.
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/hidive/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ clap = { version = "4.5.1", features = ["derive"] }
eyre = "0.6.8"
needletail = "*"
noodles = "*"
parquet = "*"
path-absolutize = "3.1.1"
parquet = "*"
petgraph = "*"
#poasta = { path = "/Users/kiran/repositories/poasta" }
#polars = "*"
Expand Down
19 changes: 1 addition & 18 deletions src/hidive/src/fetch.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,9 @@
// Import necessary standard library modules
use std::path::PathBuf;
use std::collections::HashSet;

// Import the Absolutize trait to convert relative paths to absolute paths
use path_absolutize::Absolutize;

// Import the Url type to work with URLs
use url::Url;

// Import the skydive module, which contains the necessary functions for staging data
use skydive;

Expand All @@ -25,20 +21,7 @@ use skydive;
/// Panics if any locus in `loci_list` cannot be parsed.
pub fn start(output: &PathBuf, loci_list: &Vec<String>, bam_paths: &Vec<PathBuf>, require_spanning_reads: bool) {
let loci = skydive::utils::parse_loci(loci_list);

// Convert the list of BAM file paths into a HashSet of URLs
let reads_urls: HashSet<Url> = bam_paths
.iter()
// Use filter_map to attempt to parse each path as a URL, and collect the successful ones
.filter_map(|path| {
let path_str = path.to_string_lossy();
if path_str.starts_with("gs://") {
Url::parse(&path_str).ok()
} else {
Url::from_file_path(path.absolutize().unwrap()).ok()
}
})
.collect();
let reads_urls = skydive::utils::parse_file_names(bam_paths);

// Get the system's temporary directory path
let cache_path = std::env::temp_dir();
Expand Down
1 change: 1 addition & 0 deletions src/skydive/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ memoffset = "0.9.0"
ndarray = { version = "0.15.6", features = ["rayon"] }
needletail = "*"
openssl = { version = "0.10", features = ["vendored"] }
path-absolutize = "3.1.1"
parquet = "*"
petgraph = "*"
#polars = { version = "*", features = ["parquet", "lazy", "csv-file", "strings", "temporal", "dtype-duration", "dtype-categorical", "concat_str", "list", "list_eval", "rank", "lazy_regex"]}
Expand Down
5 changes: 4 additions & 1 deletion src/skydive/src/stage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use rust_htslib::bam::ext::BamRecordExtensions;
use std::collections::{HashMap, HashSet};
use std::env;
use std::path::PathBuf;
use std::io::{self, Write};
use linear_map::LinearMap;

// Import the Url type to work with URLs.
Expand All @@ -19,6 +20,7 @@ use gag::Gag;

// Import rayon's parallel iterator traits.
use rayon::prelude::*;
use rayon::iter::{ParallelIterator, IntoParallelRefIterator};

// Import types from rust_htslib for working with BAM files.
use rust_htslib::bam::{ self, Header, IndexedReader, Read };
Expand Down Expand Up @@ -102,6 +104,7 @@ fn stage_data_from_all_files(
loci: &HashSet<(String, u64, u64)>,
cache_path: &PathBuf,
) -> Result<Vec<(Vec<LinearMap<std::string::String, std::string::String>>, Vec<rust_htslib::bam::Record>)>> {

// Use a parallel iterator to process multiple BAM files concurrently.
let all_data: Vec<_> = reads_urls
.par_iter()
Expand Down Expand Up @@ -153,7 +156,7 @@ pub fn stage_data(
loci: &HashSet<(String, u64, u64)>,
reads_urls: &HashSet<Url>,
cache_path: &PathBuf,
require_spanning_reads: bool
require_spanning_reads: bool,
) -> Result<()> {
// Disable stderr from trying to open an IndexedReader a few times, so
// that the Jupyter notebook user doesn't get confused by intermediate
Expand Down
63 changes: 61 additions & 2 deletions src/skydive/src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use std::collections::HashSet;
use std::{collections::HashSet, io::BufRead, path::PathBuf};

use anyhow::Result;
use url::Url;

use path_absolutize::Absolutize;

pub fn parse_loci(loci_list: &Vec<String>) -> HashSet<(String, u64, u64)> {
// Initialize a HashSet to store unique loci after parsing
Expand Down Expand Up @@ -58,4 +61,60 @@ pub fn parse_locus(locus: String) -> Result<(String, u64, u64)> {
} else {
anyhow::bail!("Locus format for '{}' is incorrect. It should be 'chr:start[-stop]'.", locus);
}
}
}

/// Converts a list of BAM/CRAM file paths into a `HashSet` of URLs.
///
/// Paths beginning with `gs://` are parsed as-is; all other paths are treated
/// as local files and absolutized before conversion to a `file://` URL.
/// Any local file ending in `.txt` is assumed to be a file of filenames
/// (FOFN): each of its lines is expanded into the result set (using the same
/// gs:// / local-path rules), and the FOFN entry itself is removed.
///
/// Paths that cannot be absolutized or converted to a URL are silently
/// skipped rather than panicking.
pub fn parse_file_names(bam_paths: &Vec<PathBuf>) -> HashSet<Url> {
    // Single shared conversion: gs:// paths parse directly as URLs; local
    // paths must be absolutized first because relative paths cannot form a
    // valid file:// URL. Unconvertible paths yield None and are dropped.
    let path_to_url = |path: &PathBuf| -> Option<Url> {
        let path_str = path.to_string_lossy();
        if path_str.starts_with("gs://") {
            Url::parse(&path_str).ok()
        } else {
            Url::from_file_path(path.absolutize().ok()?).ok()
        }
    };

    // Convert the list of BAM file paths into a HashSet of URLs.
    let mut reads_urls: HashSet<Url> = bam_paths.iter().filter_map(path_to_url).collect();

    // Identify local files ending in .txt: these are files of filenames.
    let fofn_urls: Vec<Url> = reads_urls
        .iter()
        .filter(|url| {
            url.scheme() == "file"
                && url
                    .to_file_path()
                    .map(|p| p.extension().and_then(std::ffi::OsStr::to_str) == Some("txt"))
                    .unwrap_or(false)
        })
        .cloned()
        .collect();

    // Replace each FOFN entry with the files it lists.
    for url in fofn_urls {
        reads_urls.remove(&url);

        // Safe: we only collected URLs whose scheme is "file" above.
        let path = url.to_file_path().unwrap();
        if let Ok(file) = std::fs::File::open(&path) {
            let listed: Vec<PathBuf> = std::io::BufReader::new(file)
                .lines()
                // Skip unreadable lines rather than aborting the expansion.
                .filter_map(Result::ok)
                .map(PathBuf::from)
                .collect();
            reads_urls.extend(listed.iter().filter_map(path_to_url));
        }
    }

    reads_urls
}

0 comments on commit a059832

Please sign in to comment.