Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updated HCMI to include more metadata #187

Merged
merged 1 commit into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 44 additions & 18 deletions build/hcmi/01-createHCMISamplesFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
import argparse
import numpy as np





def align_to_linkml_schema(input_df):
"""
Maps the 'model_type' column of the input DataFrame to a set of predefined categories
Expand Down Expand Up @@ -123,11 +127,18 @@ def fetch_metadata_for_samples(uuids):
"fields": (
"cases.sample_ids,"
"cases.case_id,"
"cases.submitter_id,"
"cases.annotations.case_submitter_id,"
"cases.samples.sample_id,"
"cases.samples.portions.analytes.aliquots.aliquot_id,"
"cases.samples.sample_type,"
"cases.diagnoses.submitter_id,"
"cases.diagnoses.diagnosis_id,"
"cases.diagnoses.classification_of_tumor,"
"cases.diagnoses.tissue_or_organ_of_origin,"
"cases.diagnoses.primary_diagnosis,"
"cases.diagnoses.treatments.treatment_id,"##getting these but ignoring for now
"cases.diagnoses.treatments.submitter_id," ##getting these but ignoring for now
"cases.samples.tumor_descriptor,"
"cases.samples.composition"
),
Expand Down Expand Up @@ -158,59 +169,73 @@ def extract_data(data):
for idx, sample in enumerate(case['samples']):
for portion in sample['portions']:
for analyte in portion['analytes']:

for aliquot in analyte['aliquots']:
if idx < len(case['diagnoses']):
diagnosis = case['diagnoses'][idx]
extracted.append({
'id': hit['id'],
'case_id': case['case_id'],
'entry_id': hit['id'],
'case_uuid': case['case_id'],
'case_id': case['submitter_id'],
'tissue_or_organ_of_origin': diagnosis['tissue_or_organ_of_origin'],
'primary_diagnosis': diagnosis['primary_diagnosis'],
'diagnosis_id':diagnosis['submitter_id'],
'tumor_classification':diagnosis['classification_of_tumor'],
'sample_id': sample['sample_id'],
'sample_type': sample['sample_type'],
'tumor_descriptor': sample.get('tumor_descriptor', None),
#'tumor_descriptor': sample.get('tumor_descriptor', None),
'composition': sample.get('composition', None),
'aliquot_id': aliquot['aliquot_id']
'id': aliquot['aliquot_id']
})
return pd.DataFrame(extracted)

def filter_and_subset_data(df,sampfile):
def filter_and_subset_data(df,sampfile,mapfile):
"""
Filter and subset the data.

Taking a pandas dataframe containing all sample information, filter it to desired columns and rename them to match schema.

Parameters
----------
df : pandas dataframe
df : a tidied pandas dataframe
full samples table

Returns
-------
Pandas Dataframe
"""
duplicates_mask = df.drop('id', axis=1).duplicated(keep='first')
cmap = pd.read_csv(mapfile, encoding='ISO-8859-1')
filt = df[~duplicates_mask]
filt= filt.drop_duplicates(subset='aliquot_id', keep=False)
filt= filt.drop_duplicates()#(subset='id', keep=False)
filt = pd.merge(filt,cmap,right_on=['tissue_or_organ_of_origin','primary_diagnosis'],left_on=['tissue_or_organ_of_origin','primary_diagnosis'],how='left')
filt = filt.rename(
columns={"tissue_or_organ_of_origin":"common_name",
"primary_diagnosis": "cancer_type",
"composition": "model_type",
"case_id": "other_names",
"aliquot_id": "other_id"}
columns={"composition": "model_type",
"case_id": "common_name",
"id": "other_names"}
#"id": "sample_uuid"}
)
filt = filt[["cancer_type","common_name","other_names","other_id","model_type"]]
filt["other_id_source"] = "HCMI"
    ##now we can melt all the identifiers into other_id and other_id_source
longtab = pd.melt(filt, id_vars=['common_name','other_names','model_type','cancer_type'], value_vars=['diagnosis_id','tumor_classification','sample_type'])
longtab = longtab.rename(columns={'variable':'other_id_source','value':'other_id'}).drop_duplicates()
# filt = filt[["cancer_type","common_name","other_names","other_id","model_type"]]
# filt["other_id_source"] = "HCMI"
# Create new improve sample IDs

#Non-docker:
# maxval = max(pd.read_csv('../cptac/cptac_samples.csv').improve_sample_id)
# Docker:
maxval = max(pd.read_csv(sampfile).improve_sample_id)
mapping = {other_id: i for i, other_id in enumerate(filt['other_id'].unique(), start=(int(maxval)+1))}
alluuids = list(set(longtab.other_names))

mapping = pd.DataFrame.from_dict(
{"other_names": [str(a) for a in alluuids],
"improve_sample_id": range(int(maxval)+1,int(maxval)+len(alluuids)+1)
})
longtab = pd.merge(longtab,mapping,on='other_names',how='left')
# Use the map method to create the new column based on the lab-id column
filt['improve_sample_id'] = filt['other_id'].map(mapping)
return filt
#['improve_sample_id'] = longtab['other_id'].map(mapping)
return longtab


def main():
Expand Down Expand Up @@ -240,14 +265,15 @@ def main():
"""
parser = argparse.ArgumentParser()
parser.add_argument('--samples',dest='samps',help='Previous sample file')
parser.add_argument('--mapfile',dest='map',help='Mapping to common_cancer from primary_diagnosis and tissue_or_organ_of_origin',default='hcmi_cancer_types.csv')
args = parser.parse_args()
manifest_path = "full_manifest.txt"
#manifest_url = "https://raw.githubusercontent.com/PNNL-CompBio/candleDataProcessing/hcmi_update/hcmi/full_manifest.txt"
#download_from_github(manifest_url, manifest_path)
uuids = extract_uuids_from_manifest(manifest_path)
metadata = fetch_metadata_for_samples(uuids)
df = extract_data(metadata)
output = filter_and_subset_data(df,args.samps)
output = filter_and_subset_data(df,args.samps,args.map)
aligned = align_to_linkml_schema(output)
print(aligned)
aligned.to_csv("/tmp/hcmi_samples.csv",index=False)
Expand Down
6 changes: 3 additions & 3 deletions build/hcmi/02-getHCMIData.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
"""
# samples_path = "/tmp/hcmi_samples.csv"
samples = pl.read_csv(samples_path)
samples = samples.drop(["cancer_type", "common_name", "other_names", "model_type", "other_id_source"])
samples = samples.drop(["cancer_type", "common_name", "other_id", "model_type", "species","other_id_source"]).unique()

# Determine columns to select based on data_type
columns = {
Expand All @@ -448,8 +448,8 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
chunk = chunk.rename({"Variant_Classification": "variant_classification"})
chunk = chunk.select(selected_columns)

merged_chunk = samples.join(chunk, left_on='other_id', right_on='aliquot_id', how='inner')
merged_chunk = merged_chunk.drop(["aliquot_id", "other_id"])
merged_chunk = samples.join(chunk, left_on='other_names', right_on='aliquot_id', how='inner')
merged_chunk = merged_chunk.drop(["aliquot_id", "other_names"])

# Append the processed chunk
merged_data = pl.concat([merged_data, merged_chunk])
Expand Down
Loading