Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

For runs w/multiple projects, manage adapter-trimmed files #126

Merged
merged 2 commits into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sequence_processing_pipeline/FastQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ def _find_projects(self, path_to_run_id_data_fastq_dir, is_raw_input):
files = [x for x in files if x.endswith('.fastq.gz') and
'zero_files' not in x]

# remove fastq files in the only-adapter-filtered
# folder from consideration if they are present.
files = [x for x in files if 'only-adapter-filtered' not in x]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My concern here is that this folder sits at the top level of the NuQCJob folder, which potentially means that if more than one project contains a file with the same filename, that file could be overwritten; is this possible?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, that's what this PR is fixing. As NuQCJob processes each project, it will move the adapter-trimmed files from only-adapter-filtered into a subdirectory of said folder. This way there will be no overwrites while NuQCJob is running.

This line here is in FastQCJob, which is downstream of NuQCJob. It is the same line I added to qiita-rc. If the path passed to _find_projects is a sub-directory of NuQCJob, then this line will have no effect. If, however, it's passed the path to the entire working directory, this line will safely cover that situation.


# break files up into R1, R2, I1, I2
# assume _R1_ does not occur in the path as well.
r1_only = [x for x in files if '_R1_' in x]
Expand Down
29 changes: 29 additions & 0 deletions sequence_processing_pipeline/NuQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,26 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst):
for fp in files_to_move:
move(fp, dst)

@staticmethod
def _move_trimmed_files(project_name, output_path):
    '''
    Given output_path, move all fastqs to a new subdir named project_name.
    Only files directly inside output_path whose names end in '.fastq.gz'
    are moved; everything else is left in place.
    :param project_name: The name of the new folder to be created.
    :param output_path: The path to scan for fastq files.
    :return: None
    :raises ValueError: If output_path does not exist.
    :raises FileExistsError: If the project_name subdir already exists.
    '''
    if not exists(output_path):
        raise ValueError(f"'{output_path}' does not exist")

    # escape output_path so that any glob metacharacters in the directory
    # name ('[', ']', '*', '?') are matched literally rather than being
    # interpreted as wildcards, which would silently match nothing.
    pattern = f"{glob.escape(output_path)}/*.fastq.gz"

    destination = join(output_path, project_name)

    # this directory shouldn't already exist.
    makedirs(destination, exist_ok=False)

    # glob.glob() returns a fully materialized list, so moving files
    # while looping over the result is safe.
    for trimmed_file in glob.glob(pattern):
        move(trimmed_file, destination)

def run(self, callback=None):
# now a single job-script will be created to process all projects at
# the same time, and intelligently handle adapter-trimming as needed
Expand Down Expand Up @@ -244,6 +264,15 @@ def run(self, callback=None):
pattern = f"{source_dir}/*.fastq.gz"
completed_files = list(glob.glob(pattern))

# if the 'only-adapter-filtered' directory exists, move the files
# into a unique location so that files from multiple projects
# don't overwrite each other.
trimmed_only_path = join(self.output_path,
'only-adapter-filtered')

if exists(trimmed_only_path):
NuQCJob._move_trimmed_files(project_name, trimmed_only_path)

if needs_human_filtering is True:
filtered_directory = join(source_dir, 'filtered_sequences')
else:
Expand Down
63 changes: 62 additions & 1 deletion sequence_processing_pipeline/tests/test_NuQCJob.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import shutil
import unittest
from os.path import join, abspath, exists
from os.path import join, abspath, exists, dirname
from functools import partial
from sequence_processing_pipeline.NuQCJob import NuQCJob
from sequence_processing_pipeline.PipelineError import PipelineError
from os import makedirs, remove
from metapool import load_sample_sheet
import glob


class TestNuQCJob(unittest.TestCase):
Expand Down Expand Up @@ -546,6 +547,10 @@ def tearDown(self):
if exists(self.tmp_file_path):
remove(self.tmp_file_path)

# for test_move_trimmed()
if exists(self.path('NuQCJob')):
shutil.rmtree(self.path('NuQCJob'))

def test_nuqcjob_creation(self):
# use good-sample-sheet as the basis for a sample Metatranscriptomic
with self.assertRaises(PipelineError) as e:
Expand Down Expand Up @@ -1204,6 +1209,41 @@ def test_regular_expressions(self):

self._helper(job.json_regex, good_names, bad_names)

def test_move_trimmed(self):
    # Note: this test does not make use of the output_dir that other
    # tests use.

    # create every dummy file listed in SAMPLE_DIR, making parent
    # directories as needed.
    for dummy_fp in SAMPLE_DIR:
        dummy_fp = self.path(dummy_fp)
        makedirs(dirname(dummy_fp), exist_ok=True)
        with open(dummy_fp, 'w') as f:
            f.write("This is a dummy file.\n")

    trimmed_only_path = self.path('NuQCJob', 'only-adapter-filtered')

    NuQCJob._move_trimmed_files('NPH_15288', trimmed_only_path)

    new_path = join(trimmed_only_path, 'NPH_15288')
    pattern = f"{new_path}/*.fastq.gz"

    exp = [
        ('only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001.'
         'fastq.gz'),
        ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001.'
         'fastq.gz')]

    # reduce each observed path to the portion following 'NuQCJob/' so
    # that it can be compared against the relative paths in exp.
    obs = [x.split('NuQCJob/')[-1] for x in glob.glob(pattern)]

    # compare the complete contents (order-insensitive) so that the test
    # fails if no files, too few files, or unexpected files were moved.
    self.assertCountEqual(obs, exp)

    # no fastq files should remain directly under only-adapter-filtered.
    self.assertEqual(glob.glob(f"{trimmed_only_path}/*.fastq.gz"), [])

def _helper(self, regex, good_names, bad_names):
for good_name in good_names:
substr = regex.search(good_name)
Expand All @@ -1214,5 +1254,26 @@ def _helper(self, regex, good_names, bad_names):
self.assertIsNone(substr, msg=f'Regex failed on {bad_name}')


# Relative paths of the dummy files created by test_move_trimmed(). The
# entries are grouped by directory: adapter-trimmed fastqs, fastp html
# reports, fastp json reports, and miscellaneous job files.
_TRIMMED = 'NuQCJob/only-adapter-filtered'
_REPORTS = 'NuQCJob/NPH_15288/fastp_reports_dir'
_JOB_ID = 'hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d'

SAMPLE_DIR = (
    [f'{_TRIMMED}/{stem}_001.fastq.gz' for stem in (
        '359180345_S58_L001_R1',
        '359180337_S27_L001_R1',
        '359180338_S51_L001_R2',
        '359180338_S51_L001_R1',
        '359180337_S27_L001_R2')] +
    [f'{_REPORTS}/html/{sample}_L001_R1_001.html' for sample in (
        '359180354_S22',
        '359180338_S51',
        '359180345_S58',
        '359180337_S27',
        '359180353_S17')] +
    [f'{_REPORTS}/json/{sample}_L001_R1_001.json' for sample in (
        '359180353_S17',
        '359180337_S27',
        '359180345_S58',
        '359180338_S51',
        '359180354_S22')] +
    ['NuQCJob/process_all_fastq_files.sh',
     f'NuQCJob/{_JOB_ID}.1897981.completed',
     'NuQCJob/logs/slurm-1897981_1.out',
     f'NuQCJob/tmp/{_JOB_ID}-1'])

if __name__ == '__main__':
unittest.main()
Loading