
Commit 8a069f3

minor bug fixes
hrshdhgd committed Sep 21, 2024
1 parent 5744f93 commit 8a069f3
Showing 2 changed files with 11 additions and 7 deletions.
src/uniprot2s3/constants.py (1 change: 1 addition & 0 deletions)
@@ -39,3 +39,4 @@
PROTEOMES_PROTEOME_ID_COLUMNNAME = "Proteome Id"
PROTEOMES_ORGANISM_ID_COLUMNNAME = "Organism Id"
KGMICROBE_PROTEOMES_FILENAME = "Proteomes_KGMicrobe_Subset"
+CHUNK_SIZE_PER_WORKER = 1000
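
Note: the new constant replaces the hard-coded chunksize=999 previously passed to tqdm's process_map in main.py (see the last hunk below). Below is a minimal, self-contained sketch of how such a per-worker chunk size is typically passed to process_map; the toy fetch function and taxon list are placeholders, not code from this repository.

from tqdm.contrib.concurrent import process_map

CHUNK_SIZE_PER_WORKER = 1000  # value introduced in constants.py by this commit


def fetch_uniprot_data(taxon_id):
    """Placeholder for the real fetch function defined in main.py."""
    return taxon_id


if __name__ == "__main__":
    taxa = list(range(5000))  # stand-in for taxa_id_common_with_proteomes_list
    # chunksize sets how many items each worker receives per dispatch: larger chunks
    # reduce scheduling overhead but make the progress bar update more coarsely.
    results = process_map(fetch_uniprot_data, taxa, max_workers=4, chunksize=CHUNK_SIZE_PER_WORKER)
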
src/uniprot2s3/main.py (17 changes: 10 additions & 7 deletions)
@@ -15,6 +15,7 @@
from tqdm.contrib.concurrent import process_map

from .constants import (
+CHUNK_SIZE_PER_WORKER,
KGMICROBE_PROTEOMES_FILENAME,
NCBITAXON_PREFIX,
ORGANISM_ID_MIXED_CASE,
@@ -44,7 +45,6 @@
UNIPROT_S3_DIR.mkdir(parents=True, exist_ok=True)



# Function to read organisms from a CSV file and return a set
def _read_organisms_from_csv(file_path):
with open(file_path, newline="") as csvfile:
@@ -190,10 +190,8 @@ def fetch_uniprot_reference_proteome_data() -> list:
file_path = Path(RAW_DATA_DIR) / f"{PROTEOMES_FILENAME}.{UNIPROT_DESIRED_FORMAT}"
# all_proteomes_query = "%28*%29"
filtered_proteomes_query = (
"(*)+AND+((superkingdom:Bacteria)+OR+(superkingdom:Archaea))"
"+AND+((proteome_type:1)+OR+(proteome_type:2))"
"(*)+AND+((superkingdom:Bacteria)+OR+(superkingdom:Archaea))" "+AND+((proteome_type:1)+OR+(proteome_type:2))"
)


url = construct_query_url(
UNIPROT_REFERENCE_PROTEOMES_URL,
@@ -219,12 +217,15 @@ def fetch_uniprot_reference_proteome_data() -> list:
# Write response to file if it contains data
if len(response.text.strip().split("\n")) > 1:
with open(file_path, "a") as file:
-file.write(response.text)
+file.write(response.text) if PROTEOMES_ORGANISM_ID_COLUMNNAME not in response.text else None

# Read file to df for sorting
df = pd.read_csv(file_path, sep="\t", low_memory=False)
df = df.drop_duplicates()
df = df.sort_values(
-by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME], axis=0, ascending=True
+by=[PROTEOMES_ORGANISM_ID_COLUMNNAME, PROTEOMES_PROTEOME_ID_COLUMNNAME],
+axis=0,
+ascending=True,
)
df.to_csv(file_path, sep="\t", index=False)

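Side note on the guarded write above (an observation, not part of the diff): the conditional expression is evaluated only for its side effect, so an explicit if statement expresses the same logic more plainly. A sketch, assuming the check is meant to skip responses that still carry the "Organism Id" header row:

with open(file_path, "a") as file:
    # Append only when the response does not repeat the header row
    # (PROTEOMES_ORGANISM_ID_COLUMNNAME is "Organism Id" per constants.py).
    if PROTEOMES_ORGANISM_ID_COLUMNNAME not in response.text:
        file.write(response.text)
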
@@ -313,7 +314,9 @@ def run_uniprot_api_parallel(
fetch_func = partial(fetch_uniprot_data)
# If show_status is True, use process_map to display a progress bar
if show_status:
-process_map(fetch_func, taxa_id_common_with_proteomes_list, max_workers=workers, chunksize=999)
+process_map(
+fetch_func, taxa_id_common_with_proteomes_list, max_workers=workers, chunksize=CHUNK_SIZE_PER_WORKER
+)
else:
# Set up a pool of worker processes without a progress bar
with multiprocessing.Pool(processes=workers) as pool:
