Skip to content

Commit

Permalink
updated HCMI to include more metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
sgosline committed May 22, 2024
1 parent 1cc9d0b commit 76c2b58
Show file tree
Hide file tree
Showing 4 changed files with 2,252 additions and 1,343 deletions.
62 changes: 44 additions & 18 deletions build/hcmi/01-createHCMISamplesFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
import argparse
import numpy as np





def align_to_linkml_schema(input_df):
"""
Maps the 'model_type' column of the input DataFrame to a set of predefined categories
Expand Down Expand Up @@ -123,11 +127,18 @@ def fetch_metadata_for_samples(uuids):
"fields": (
"cases.sample_ids,"
"cases.case_id,"
"cases.submitter_id,"
"cases.annotations.case_submitter_id,"
"cases.samples.sample_id,"
"cases.samples.portions.analytes.aliquots.aliquot_id,"
"cases.samples.sample_type,"
"cases.diagnoses.submitter_id,"
"cases.diagnoses.diagnosis_id,"
"cases.diagnoses.classification_of_tumor,"
"cases.diagnoses.tissue_or_organ_of_origin,"
"cases.diagnoses.primary_diagnosis,"
"cases.diagnoses.treatments.treatment_id,"##getting these but ignoring for now
"cases.diagnoses.treatments.submitter_id," ##getting these but ignoring for now
"cases.samples.tumor_descriptor,"
"cases.samples.composition"
),
Expand Down Expand Up @@ -158,59 +169,73 @@ def extract_data(data):
for idx, sample in enumerate(case['samples']):
for portion in sample['portions']:
for analyte in portion['analytes']:

for aliquot in analyte['aliquots']:
if idx < len(case['diagnoses']):
diagnosis = case['diagnoses'][idx]
extracted.append({
'id': hit['id'],
'case_id': case['case_id'],
'entry_id': hit['id'],
'case_uuid': case['case_id'],
'case_id': case['submitter_id'],
'tissue_or_organ_of_origin': diagnosis['tissue_or_organ_of_origin'],
'primary_diagnosis': diagnosis['primary_diagnosis'],
'diagnosis_id':diagnosis['submitter_id'],
'tumor_classification':diagnosis['classification_of_tumor'],
'sample_id': sample['sample_id'],
'sample_type': sample['sample_type'],
'tumor_descriptor': sample.get('tumor_descriptor', None),
#'tumor_descriptor': sample.get('tumor_descriptor', None),
'composition': sample.get('composition', None),
'aliquot_id': aliquot['aliquot_id']
'id': aliquot['aliquot_id']
})
return pd.DataFrame(extracted)

def filter_and_subset_data(df, sampfile, mapfile='hcmi_cancer_types.csv'):
    """
    Build the long-format HCMI samples table and assign new improve_sample_ids.

    The per-aliquot table is de-duplicated, joined to a cancer-type lookup,
    renamed to the schema column names, and melted so that each identifier
    (diagnosis id, tumor classification, sample type) becomes its own
    ``other_id`` / ``other_id_source`` row.

    Parameters
    ----------
    df : pandas.DataFrame
        Tidied samples table. The code reads the columns ``id``, ``case_id``,
        ``composition``, ``tissue_or_organ_of_origin``, ``primary_diagnosis``,
        ``diagnosis_id``, ``tumor_classification`` and ``sample_type``.
    sampfile : str
        CSV of previously generated samples; its ``improve_sample_id`` column
        determines where numbering of the new ids starts.
    mapfile : str, optional
        CSV joined on ``tissue_or_organ_of_origin`` and ``primary_diagnosis``.
        It must supply a ``cancer_type`` column, because the melt step uses it.

    Returns
    -------
    pandas.DataFrame
        Columns ``common_name``, ``other_names``, ``model_type``,
        ``cancer_type``, ``other_id_source``, ``other_id`` and
        ``improve_sample_id``.
    """
    # Keep only the first row among rows that agree on every column except 'id'.
    # NOTE(review): 'id' appears to carry the aliquot id here, so later aliquots
    # of an otherwise identical record are discarded -- confirm this is intended.
    duplicates_mask = df.drop('id', axis=1).duplicated(keep='first')
    filt = df[~duplicates_mask]

    # Attach cancer_type; a left join keeps samples without a lookup entry.
    cmap = pd.read_csv(mapfile, encoding='ISO-8859-1')
    filt = pd.merge(
        filt,
        cmap,
        on=['tissue_or_organ_of_origin', 'primary_diagnosis'],
        how='left',
    )
    filt = filt.rename(
        columns={
            "composition": "model_type",
            "case_id": "common_name",
            "id": "other_names",
        }
    )

    # Melt all the identifiers into other_id / other_id_source.
    longtab = pd.melt(
        filt,
        id_vars=['common_name', 'other_names', 'model_type', 'cancer_type'],
        value_vars=['diagnosis_id', 'tumor_classification', 'sample_type'],
    )
    longtab = longtab.rename(
        columns={'variable': 'other_id_source', 'value': 'other_id'}
    ).drop_duplicates()

    # Continue numbering after the largest id already issued. Series.max()
    # skips NaN; an empty previous file means numbering starts at 1.
    previous_max = pd.read_csv(sampfile).improve_sample_id.max()
    first_new_id = 1 if pd.isna(previous_max) else int(previous_max) + 1

    # unique() preserves first-appearance order, so the id given to each sample
    # is reproducible from run to run (iterating a set of strings is not).
    sample_names = longtab['other_names'].unique()
    mapping = pd.DataFrame(
        {
            "other_names": sample_names,
            "improve_sample_id": range(
                first_new_id, first_new_id + len(sample_names)
            ),
        }
    )
    return pd.merge(longtab, mapping, on='other_names', how='left')


def main():
Expand Down Expand Up @@ -240,14 +265,15 @@ def main():
"""
parser = argparse.ArgumentParser()
parser.add_argument('--samples',dest='samps',help='Previous sample file')
parser.add_argument('--mapfile',dest='map',help='Mapping to common_cancer from primary_diagnosis and tissue_or_organ_of_origin',default='hcmi_cancer_types.csv')
args = parser.parse_args()
manifest_path = "full_manifest.txt"
#manifest_url = "https://raw.githubusercontent.com/PNNL-CompBio/candleDataProcessing/hcmi_update/hcmi/full_manifest.txt"
#download_from_github(manifest_url, manifest_path)
uuids = extract_uuids_from_manifest(manifest_path)
metadata = fetch_metadata_for_samples(uuids)
df = extract_data(metadata)
output = filter_and_subset_data(df,args.samps)
output = filter_and_subset_data(df,args.samps,args.map)
aligned = align_to_linkml_schema(output)
print(aligned)
aligned.to_csv("/tmp/hcmi_samples.csv",index=False)
Expand Down
6 changes: 3 additions & 3 deletions build/hcmi/02-getHCMIData.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
"""
# samples_path = "/tmp/hcmi_samples.csv"
samples = pl.read_csv(samples_path)
samples = samples.drop(["cancer_type", "common_name", "other_names", "model_type", "other_id_source"])
samples = samples.drop(["cancer_type", "common_name", "other_id", "model_type", "species","other_id_source"]).unique()

# Determine columns to select based on data_type
columns = {
Expand All @@ -448,8 +448,8 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
chunk = chunk.rename({"Variant_Classification": "variant_classification"})
chunk = chunk.select(selected_columns)

merged_chunk = samples.join(chunk, left_on='other_id', right_on='aliquot_id', how='inner')
merged_chunk = merged_chunk.drop(["aliquot_id", "other_id"])
merged_chunk = samples.join(chunk, left_on='other_names', right_on='aliquot_id', how='inner')
merged_chunk = merged_chunk.drop(["aliquot_id", "other_names"])

# Append the processed chunk
merged_data = pl.concat([merged_data, merged_chunk])
Expand Down
Loading

0 comments on commit 76c2b58

Please sign in to comment.