Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD diagnosis info to ukbb #26

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 192 additions & 2 deletions code/ukbb.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,50 @@
"""Load uk biobank data and extract demographic information.

Author: Hao-Ting Wang; last edit 2024-02-02
Author: Hao-Ting Wang; last edit 2024-07-11

All input stored in `data/ukbb` folder. The content of `data` is not
included in the repository.

Labels are curated from the ukbb website and saved to `data/ukbb`.
This is public information hence it is not included in the repository.
You will need to download coding9, coding10, and coding19.
For example, coding9 can be downloaded from:
https://biobank.ndph.ox.ac.uk/ukb/coding.cgi?id=9

data_file is provided by SJ's lab and requires DUA,
hence it is not included in the repository.

To expand this tool, add more information to `info` with information
from Data_fields xlsx file (provided by SJ's lab and requires DUA).

The output is saved in the output directory as `ukbb_pheno.tsv` and
`ukbb_pheno.json`. The json file contains the meta information for each
field in the tsv file.

The script also takes a qc file as input. The qc file is generated by
giga_auto_qc and contains the qc information for each participant.
The qc file is used to filter out participants that did not pass qc.
https://github.com/SIMEXP/giga_auto_qc

The script also takes a list of confound columns as input. The confound
columns are used to balance the sample for each diagnosis group. The
sample is balanced using the general_class_balancer:
https://github.com/mleming/general_class_balancer
Packaged version:
https://github.com/SIMEXP/general_class_balancer
This is the same procedure as AH's multi-site analysis:
https://github.com/harveyaa/neuropsych_mtl/blob/master/conf_balancing

The script also outputs a json file with the balanced sample for each
diagnosis group. The json file is saved as `downstream_balanced_sample.json`.
"""
from pathlib import Path
import pandas as pd
import json
import argparse
import seaborn as sns
import matplotlib.pyplot as plt
from general_class_balancer import general_class_balancer as gcb

# phenotypic information curated through ukbb website
info = {
Expand All @@ -45,9 +71,135 @@
'description': 'first repeat imaging visit',
}
}
},
'diagnosis':{
'id': 41270,
'description': 'Diagnoses - International Classification of Disease version 10 (ICD10)',
'labels': 'coding19.tsv', # download from ukbb website
'instance':{
'F00': {
'label': 'ADD',
'description': 'Alzheimer disease - Dementia',
},
'F10': {
'label': 'ALCO',
'description': 'Alcohol Abuse',
},
'F20': {
'label': 'SCZ',
'description': 'Schizophrenia',
},
'F31': {
'label': 'BIPOLAR',
'description': 'Bipolar disorder',
},
'F32': {
'label': 'DEP',
'description': 'Depressive disorder',
},
'G20': {
'label': 'PARK',
'description': 'Parkinson',
},
'G30': {
'label': 'ADD',
'description': 'Alzheimer disease - Dementia',
},
'G35': {
'label': 'MS',
'description': 'Multiple sclerosis',
},
'G40': {
'label': 'EPIL',
'description': 'Epilepsy',
},
}
},
'per-group':{
'description': 'Mathcing control vs disease per group.',
}
}

def read_ukbb_diagnosis_per_group(data_file):
    """Build one binary (0/1) column per diagnosis group from ICD10 codes.

    Parameters
    ----------
    data_file : str or pathlib.Path
        Tab-separated UK Biobank phenotype file. The first column is the
        participant id (used as index); diagnosis columns start with
        ``f.41270`` and hold full ICD10 codes (``NA`` for missing).

    Returns
    -------
    data : pandas.DataFrame
        One column per diagnosis label in ``info['diagnosis']['instance']``,
        set to 1 when the participant carries any ICD10 code mapping to
        that label, 0 otherwise.
    meta_data : dict
        Per-label meta information describing the 0 (control) and 1
        (diagnosed) values.
    """
    data = pd.read_csv(
        data_file, sep='\t', na_values='NA', index_col=0, low_memory=False
    )
    # keep only the ICD10 diagnosis fields (f.41270.*)
    data = data.filter(regex='^f.41270', axis=1)
    icd_columns = list(data.columns)

    # Initialise every diagnosis column to 0 and build meta data up front.
    # Several ICD10 codes can share one label (e.g. F00 and G30 both map to
    # ADD); initialising once here - instead of per code inside the row
    # loop - avoids a later code resetting a label that an earlier code
    # already flagged for the same participant.
    meta_data = {}
    for diagnosis_code, entry in info['diagnosis']['instance'].items():
        diagnosis_label = entry['label']
        if diagnosis_label not in meta_data:
            data[diagnosis_label] = 0
            meta_data[diagnosis_label] = {
                'fid': diagnosis_label,
                'description': diagnosis_label,
                'instance': {
                    0: {
                        'name': 'control',
                        'description': 'Healthy control, matched sex, age, and site with the disease group by general class balancer.',
                    },
                    1: {
                        'name': diagnosis_label,
                        'description': entry['description'],
                    },
                },
            }

    for idx, row in data[icd_columns].iterrows():
        # reduce each full ICD10 code to its three-character category
        codes = {value[:3] for value in row.dropna()}
        for diagnosis_code, entry in info['diagnosis']['instance'].items():
            if diagnosis_code in codes:
                data.loc[idx, entry['label']] = 1

    diagnosis_groups = [k for k in meta_data.keys() if "_sample" not in k]
    return data[diagnosis_groups], meta_data


def read_ukbb_general_diagnosis(data_file):
    """Derive a single primary diagnosis label per participant.

    Parameters
    ----------
    data_file : str or pathlib.Path
        Tab-separated UK Biobank phenotype file. The first column is the
        participant id (used as index); diagnosis columns start with
        ``f.41270`` and hold full ICD10 codes (``NA`` for missing).

    Returns
    -------
    data : pandas.DataFrame
        Columns ``diagnosis`` (label of the first ICD10 code of interest,
        or ``'HC'``), ``icd10`` (the matched three-character code, else
        the first recorded code, else None), and ``n_icd10`` (number of
        recorded codes).
    meta_data : dict
        Meta information for the ``diagnosis`` field.
    """
    data = pd.read_csv(
        data_file, sep='\t', na_values='NA', index_col=0, low_memory=False
    )
    # keep only the ICD10 diagnosis fields (f.41270.*)
    data = data.filter(regex='^f.41270', axis=1)
    known_codes = info['diagnosis']['instance']
    for participant, record in data.iterrows():
        # three-character ICD10 categories of this participant's diagnoses
        codes = [full_code[:3] for full_code in record.dropna()]
        if not codes:
            # participant has no recorded diagnosis history at all
            data.loc[participant, 'n_icd10'] = 0
            data.loc[participant, 'icd10'] = None
            data.loc[participant, 'diagnosis'] = 'HC'
            continue
        # default: healthy control, carrying the first recorded code
        data.loc[participant, 'icd10'] = codes[0]
        data.loc[participant, 'n_icd10'] = len(codes)
        data.loc[participant, 'diagnosis'] = 'HC'
        # the first code of interest, in recorded order, wins
        for code in codes:
            if code in known_codes:
                data.loc[participant, 'diagnosis'] = known_codes[code]['label']
                data.loc[participant, 'icd10'] = code
                break

    meta_data = {
        'diagnosis': {
            'fid': 'f.41270.x',
            'description': info['diagnosis']['description'],
            'labels': {  # curated from ukbb website
                'ADD': 'Alzheimer disease - Dementia',
                'ALCO': 'Alcohol Abuse',
                'DEP': 'Depressive disorder',
                'SCZ': 'Schizophrenia',
                'BIPOLAR': 'Bipolar disorder',
                'PARK': 'Parkinson',
                'MS': 'Multiple sclerosis',
                'EPIL': 'Epilepsy',
                'HC': 'Healthy control',
            }
        }
    }
    return data[['diagnosis', 'icd10', 'n_icd10']], meta_data


def read_ukbb_data(data_file, info_label):

Expand All @@ -57,6 +209,11 @@ def read_ukbb_data(data_file, info_label):
f'{list(info.keys())}'
)

# diagnostic data has to be handled differently
if info_label == 'diagnosis':
return read_ukbb_general_diagnosis(data_file)
if info_label == 'per-group':
return read_ukbb_diagnosis_per_group(data_file)
# compile meta data and construct id in the datafile (fid)
# format: f.<id>.<instance>.0
# not sure the field for 0 is for, but it is always 0
Expand Down Expand Up @@ -111,11 +268,14 @@ def read_ukbb_data(data_file, info_label):
# use argparse to get datafile
parser = argparse.ArgumentParser(description='Extract ukbb data')
parser.add_argument('datafile', type=Path, help='ukbb data file')
parser.add_argument('qcfile', type=Path, help='giga auto qc output file')
parser.add_argument('output', type=Path, help='output directory')
parser.add_argument('--confounds', type=str, nargs="+", help='confound columns', default=list())

args = parser.parse_args()
data_file = args.datafile
output_dir = args.output
confounds = args.confounds

curated_data = []
curated_meta = {}
Expand All @@ -124,7 +284,37 @@ def read_ukbb_data(data_file, info_label):
curated_data.append(data)
curated_meta.update(meta_data)
curated_data = pd.concat(curated_data, axis=1)
curated_data.index.name = 'participant_id'
curated_data = curated_data[~curated_data.index.duplicated(keep='first')]

curated_data.to_csv(output_dir / 'ukbb_pheno.tsv', sep='\t')
# add qc information
qc = pd.read_csv(args.qcfile, sep='\t', index_col='participant_id')[['mean_fd_raw', 'proportion_kept', 'pass_all_qc']]
qc = qc[~qc.index.duplicated(keep='first')]
curated_data = pd.concat([qc, curated_data], axis=1, join='inner')
passed_qc = curated_data[curated_data['pass_all_qc'] == True]

# save the data
passed_qc.to_csv(output_dir / 'ukbb_pheno.tsv', sep='\t')
with open(output_dir / 'ukbb_pheno.json', 'w') as f:
json.dump(curated_meta, f, indent=2)

# create a matched sample for each diagnosis group
if len(confounds) > 0:
downstreams = {}
diagnosis_groups = list(curated_meta['diagnosis']['labels'].keys())
diagnosis_groups.remove('HC')
for d in diagnosis_groups:
select_sample = gcb.class_balance(passed_qc[d].values.astype(int), passed_qc[confounds].values.T)
downstreams[d] = passed_qc.index[select_sample].tolist()
with open(output_dir / 'downstream_balanced_sample.json', 'w') as f:
json.dump(downstreams, f, indent=2)

# plot the distribution of confounds
for d in diagnosis_groups:
subjects = downstreams[d]
df = passed_qc.loc[subjects, :]
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
fig.suptitle(f"Confound blanced sample (N={len(subjects)}): " + curated_meta[d]['instance'][1]['description'])
for ax, c in zip(axes, ['sex', 'age', 'mean_fd_raw', 'proportion_kept']):
sns.histplot(x=c, data=df, hue=d, kde=True, ax=ax)
fig.savefig(output_dir / f'distribution_balanced_sample_{d}.png')
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
numpy
pandas[excel]
seaborn
general_class_balancer @ git+https://github.com/SIMEXP/general_class_balancer@main