Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updated HCMI to include more metadata #187

Merged
merged 1 commit into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 44 additions & 18 deletions build/hcmi/01-createHCMISamplesFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
import argparse
import numpy as np





def align_to_linkml_schema(input_df):
"""
Maps the 'model_type' column of the input DataFrame to a set of predefined categories
Expand Down Expand Up @@ -123,11 +127,18 @@ def fetch_metadata_for_samples(uuids):
"fields": (
"cases.sample_ids,"
"cases.case_id,"
"cases.submitter_id,"
"cases.annotations.case_submitter_id,"
"cases.samples.sample_id,"
"cases.samples.portions.analytes.aliquots.aliquot_id,"
"cases.samples.sample_type,"
"cases.diagnoses.submitter_id,"
"cases.diagnoses.diagnosis_id,"
"cases.diagnoses.classification_of_tumor,"
"cases.diagnoses.tissue_or_organ_of_origin,"
"cases.diagnoses.primary_diagnosis,"
"cases.diagnoses.treatments.treatment_id,"##getting these but ignoring for now
"cases.diagnoses.treatments.submitter_id," ##getting these but ignoring for now
"cases.samples.tumor_descriptor,"
"cases.samples.composition"
),
Expand Down Expand Up @@ -158,59 +169,73 @@ def extract_data(data):
for idx, sample in enumerate(case['samples']):
for portion in sample['portions']:
for analyte in portion['analytes']:

for aliquot in analyte['aliquots']:
if idx < len(case['diagnoses']):
diagnosis = case['diagnoses'][idx]
extracted.append({
'id': hit['id'],
'case_id': case['case_id'],
'entry_id': hit['id'],
'case_uuid': case['case_id'],
'case_id': case['submitter_id'],
'tissue_or_organ_of_origin': diagnosis['tissue_or_organ_of_origin'],
'primary_diagnosis': diagnosis['primary_diagnosis'],
'diagnosis_id':diagnosis['submitter_id'],
'tumor_classification':diagnosis['classification_of_tumor'],
'sample_id': sample['sample_id'],
'sample_type': sample['sample_type'],
'tumor_descriptor': sample.get('tumor_descriptor', None),
#'tumor_descriptor': sample.get('tumor_descriptor', None),
'composition': sample.get('composition', None),
'aliquot_id': aliquot['aliquot_id']
'id': aliquot['aliquot_id']
})
return pd.DataFrame(extracted)

def filter_and_subset_data(df,sampfile):
def filter_and_subset_data(df,sampfile,mapfile):
"""
Filter and subset the data.

Taking a pandas dataframe containing all sample information, filter it to desired columns and rename them to match schema.

Parameters
----------
df : pandas dataframe
df : a tidied pandas dataframe
full samples table

Returns
-------
Pandas Dataframe
"""
duplicates_mask = df.drop('id', axis=1).duplicated(keep='first')
cmap = pd.read_csv(mapfile, encoding='ISO-8859-1')
filt = df[~duplicates_mask]
filt= filt.drop_duplicates(subset='aliquot_id', keep=False)
filt= filt.drop_duplicates()#(subset='id', keep=False)
filt = pd.merge(filt,cmap,right_on=['tissue_or_organ_of_origin','primary_diagnosis'],left_on=['tissue_or_organ_of_origin','primary_diagnosis'],how='left')
filt = filt.rename(
columns={"tissue_or_organ_of_origin":"common_name",
"primary_diagnosis": "cancer_type",
"composition": "model_type",
"case_id": "other_names",
"aliquot_id": "other_id"}
columns={"composition": "model_type",
"case_id": "common_name",
"id": "other_names"}
#"id": "sample_uuid"}
)
filt = filt[["cancer_type","common_name","other_names","other_id","model_type"]]
filt["other_id_source"] = "HCMI"
    ##now we can melt all the identifiers into other_id and other_id_source
longtab = pd.melt(filt, id_vars=['common_name','other_names','model_type','cancer_type'], value_vars=['diagnosis_id','tumor_classification','sample_type'])
longtab = longtab.rename(columns={'variable':'other_id_source','value':'other_id'}).drop_duplicates()
# filt = filt[["cancer_type","common_name","other_names","other_id","model_type"]]
# filt["other_id_source"] = "HCMI"
# Create new improve sample IDs

#Non-docker:
# maxval = max(pd.read_csv('../cptac/cptac_samples.csv').improve_sample_id)
# Docker:
maxval = max(pd.read_csv(sampfile).improve_sample_id)
mapping = {other_id: i for i, other_id in enumerate(filt['other_id'].unique(), start=(int(maxval)+1))}
alluuids = list(set(longtab.other_names))

mapping = pd.DataFrame.from_dict(
{"other_names": [str(a) for a in alluuids],
"improve_sample_id": range(int(maxval)+1,int(maxval)+len(alluuids)+1)
})
longtab = pd.merge(longtab,mapping,on='other_names',how='left')
# Use the map method to create the new column based on the lab-id column
filt['improve_sample_id'] = filt['other_id'].map(mapping)
return filt
#['improve_sample_id'] = longtab['other_id'].map(mapping)
return longtab


def main():
Expand Down Expand Up @@ -240,14 +265,15 @@ def main():
"""
parser = argparse.ArgumentParser()
parser.add_argument('--samples',dest='samps',help='Previous sample file')
parser.add_argument('--mapfile',dest='map',help='Mapping to common_cancer from primary_diagnosis and tissue_or_organ_of_origin',default='hcmi_cancer_types.csv')
args = parser.parse_args()
manifest_path = "full_manifest.txt"
#manifest_url = "https://raw.githubusercontent.com/PNNL-CompBio/candleDataProcessing/hcmi_update/hcmi/full_manifest.txt"
#download_from_github(manifest_url, manifest_path)
uuids = extract_uuids_from_manifest(manifest_path)
metadata = fetch_metadata_for_samples(uuids)
df = extract_data(metadata)
output = filter_and_subset_data(df,args.samps)
output = filter_and_subset_data(df,args.samps,args.map)
aligned = align_to_linkml_schema(output)
print(aligned)
aligned.to_csv("/tmp/hcmi_samples.csv",index=False)
Expand Down
6 changes: 3 additions & 3 deletions build/hcmi/02-getHCMIData.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
"""
# samples_path = "/tmp/hcmi_samples.csv"
samples = pl.read_csv(samples_path)
samples = samples.drop(["cancer_type", "common_name", "other_names", "model_type", "other_id_source"])
samples = samples.drop(["cancer_type", "common_name", "other_id", "model_type", "species","other_id_source"]).unique()

# Determine columns to select based on data_type
columns = {
Expand All @@ -448,8 +448,8 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
chunk = chunk.rename({"Variant_Classification": "variant_classification"})
chunk = chunk.select(selected_columns)

merged_chunk = samples.join(chunk, left_on='other_id', right_on='aliquot_id', how='inner')
merged_chunk = merged_chunk.drop(["aliquot_id", "other_id"])
merged_chunk = samples.join(chunk, left_on='other_names', right_on='aliquot_id', how='inner')
merged_chunk = merged_chunk.drop(["aliquot_id", "other_names"])

# Append the processed chunk
merged_data = pl.concat([merged_data, merged_chunk])
Expand Down
Loading