-
Notifications
You must be signed in to change notification settings - Fork 0
/
FastQDumpConcurrency.py
63 lines (51 loc) · 1.8 KB
/
FastQDumpConcurrency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#! /usr/bin/python
"""A script for downloading fastq files concurrently.
The script downloads the fastq files with several worker processes in parallel.
The SRA accession numbers to download are read from csv files, one file per project;
each csv file is named after its BioProject accession (e.g. PRJNA494975.csv).
"""
import glob
import os
import subprocess
from multiprocessing import Pool
# Path to the directory in which the fastq files are stored
DATA_PATH = "/home/mali/NewDrive/Project/FSL/Data"
def read_file(csv_path):
    """Read SRA accession numbers from a csv file.

    Entries may be separated by commas and/or line breaks. Whitespace around
    each entry is stripped and empty entries (e.g. from a trailing comma, a
    blank line, or an empty file) are dropped, so every returned item is a
    non-empty string.

    Parameters
    ----------
    csv_path : str
        Path to the csv file.

    Returns
    -------
    list of str
        The SRA accession numbers, in the order they appear in the file.
        The list is empty when the file contains no entries.
    """
    with open(csv_path, "r") as file:
        content = file.read()

    accessions = []
    # A line break is treated like a comma, so the last entry of one line is
    # never glued onto the first entry of the next line.
    for line in content.splitlines():
        for field in line.split(","):
            entry = field.strip()
            if entry:
                accessions.append(entry)
    return accessions
def _run_command(args):
    """Run an external command and report whether it succeeded.

    Parameters
    ----------
    args : list of str
        The program name followed by its arguments. The list is passed to
        the program directly (no shell), so the arguments need no quoting.

    Returns
    -------
    bool
        True if the command exited with status 0. False if it exited with a
        non-zero status or could not be started at all.
    """
    try:
        completed = subprocess.run(args, check=False)
    except OSError as error:
        # Raised e.g. when the program is not installed or not executable.
        print(f"Could not run {args[0]}: {error}", flush=True)
        return False
    if completed.returncode != 0:
        print(f"{args[0]} exited with status {completed.returncode}", flush=True)
        return False
    return True


def dump_fastq(sra_number):
    """Prefetch one SRA run, dump it to fastq, gzip the fastq files and remove the .sra file.

    The steps run in order and the function stops at the first step that
    fails, so the downloaded .sra file is only removed after the fastq files
    have been written and compressed. Failures are reported on stdout rather
    than raised, so one bad accession does not abort the other downloads when
    the function is used with ``Pool.map``.

    Parameters
    ----------
    sra_number : str
        A SRA accession number. Empty or whitespace-only values are skipped.
    """
    if not sra_number or not sra_number.strip():
        # An empty accession would make sra_path point at DATA_PATH itself.
        print("Skipping empty SRA accession number", flush=True)
        return

    if not _run_command(["prefetch", sra_number, "-O", DATA_PATH]):
        print(f"{sra_number} could not be prefetched; skipping", flush=True)
        return
    print(f"{sra_number} prefetched", flush=True)

    # prefetch output for this accession lives in DATA_PATH/<sra_number>;
    # the fastq files are written into that same directory.
    sra_path = os.path.join(DATA_PATH, sra_number)
    if not _run_command(["fasterq-dump", sra_path, "-O", sra_path]):
        print(f"{sra_number} could not be dumped; keeping the .sra file", flush=True)
        return

    # Expand the *.fastq pattern here because no shell is involved any more.
    fastq_files = sorted(glob.glob(os.path.join(glob.escape(sra_path), "*.fastq")))
    if not fastq_files:
        print(f"{sra_number}: no fastq files found; keeping the .sra file", flush=True)
        return
    if not _run_command(["gzip", *fastq_files]):
        print(f"{sra_number}: gzip failed; keeping the .sra file", flush=True)
        return

    # Remove the SRA file
    sra_file = os.path.join(sra_path, f"{sra_number}.sra")
    try:
        os.remove(sra_file)
    except OSError as error:
        print(f"Could not remove {sra_file}: {error}", flush=True)
# Other accession lists that can be swapped in below:
# sra_list = read_file("PRJEB40875.csv")
# sra_list = read_file("PRJEB28329.csv")
if __name__ == "__main__":
    # The guard is required for multiprocessing: worker processes started
    # with the "spawn" method re-import this module, and without the guard
    # each of them would try to start the whole download again.
    sra_list = read_file("PRJNA494975.csv")
    print(f"{len(sra_list)} files are going to dump.")
    # Three worker processes; each one handles one accession at a time.
    with Pool(3) as pool:
        pool.map(dump_fastq, sra_list)