diff --git a/code/ukbb.py b/code/ukbb.py
index 1a8055d..7c46109 100644
--- a/code/ukbb.py
+++ b/code/ukbb.py
@@ -1,12 +1,15 @@
 """Load UK Biobank data and extract demographic information.
 
-Author: Hao-Ting Wang; last edit 2024-02-02
+Author: Hao-Ting Wang; last edit 2024-07-11
 
 All input is stored in the `data/ukbb` folder. The content of `data` is not
 included in the repository.
 
 Labels are curated from the UK Biobank website and saved to `data/ukbb`.
 This is public information, hence it is not included in the repository.
+You will need to download coding9, coding10, and coding19.
+For example, coding9 can be downloaded from:
+https://biobank.ndph.ox.ac.uk/ukb/coding.cgi?id=9
 
 data_file is provided by SJ's lab and requires a DUA, hence it is not
 included in the repository.
@@ -14,11 +17,34 @@
 To expand this tool, add more information to `info` with information from the
 Data_fields xlsx file (provided by SJ's lab and requires a DUA).
 
+The output is saved in the output directory as `ukbb_pheno.tsv` and
+`ukbb_pheno.json`. The json file contains the meta information for each
+field in the tsv file.
+
+The script also takes a QC file as input. The QC file is generated by
+giga_auto_qc and contains the QC information for each participant.
+It is used to filter out participants that did not pass QC.
+https://github.com/SIMEXP/giga_auto_qc
+
+The script also takes a list of confound columns as input. The confound
+columns are used to balance the sample for each diagnosis group. The
+sample is balanced using the general_class_balancer:
+https://github.com/mleming/general_class_balancer
+Packaged version:
+https://github.com/SIMEXP/general_class_balancer
+This is the same procedure as AH's multi-site analysis:
+https://github.com/harveyaa/neuropsych_mtl/blob/master/conf_balancing
+
+The script also outputs a json file with the balanced sample for each
+diagnosis group, saved as `downstream_balanced_sample.json`.
""" from pathlib import Path import pandas as pd import json import argparse +import seaborn as sns +import matplotlib.pyplot as plt +from general_class_balancer import general_class_balancer as gcb # phenotypic information curated through ukbb website info = { @@ -45,9 +71,135 @@ 'description': 'first repeat imaging visit', } } + }, + 'diagnosis':{ + 'id': 41270, + 'description': 'Diagnoses - International Classification of Disease version 10 (ICD10)', + 'labels': 'coding19.tsv', # download from ukbb website + 'instance':{ + 'F00': { + 'label': 'ADD', + 'description': 'Alzheimer disease - Dementia', + }, + 'F10': { + 'label': 'ALCO', + 'description': 'Alcohol Abuse', + }, + 'F20': { + 'label': 'SCZ', + 'description': 'Schizophrenia', + }, + 'F31': { + 'label': 'BIPOLAR', + 'description': 'Bipolar disorder', + }, + 'F32': { + 'label': 'DEP', + 'description': 'Depressive disorder', + }, + 'G20': { + 'label': 'PARK', + 'description': 'Parkinson', + }, + 'G30': { + 'label': 'ADD', + 'description': 'Alzheimer disease - Dementia', + }, + 'G35': { + 'label': 'MS', + 'description': 'Multiple sclerosis', + }, + 'G40': { + 'label': 'EPIL', + 'description': 'Epilepsy', + }, + } + }, + 'per-group':{ + 'description': 'Mathcing control vs disease per group.', } } +def read_ukbb_diagnosis_per_group(data_file): + # read data_file with subject id and all columns started with f.41270 + data = pd.read_csv(data_file, sep='\t', na_values='NA', index_col=0, low_memory=False) + data = data.filter(regex='^f.41270', axis=1) + meta_data = {} + for idx, row in data.iterrows(): + row = row.dropna() + # reduce the value to the first three characters + row = row.apply(lambda x: x[:3]) + row = row.tolist() + for diagnosis_code in info['diagnosis']['instance']: + diagnosis_label = info['diagnosis']['instance'][diagnosis_code]['label'] + data.loc[idx, diagnosis_label] = 0 + if diagnosis_code in row: + data.loc[idx, diagnosis_label] = 1 + if diagnosis_label not in meta_data: + meta_data[diagnosis_label] = { + 'fid': diagnosis_label, + 'description': diagnosis_label, + 'instance':{ + 0: { + 'name': 'control', + 'description': 'Healthy control, matched sex, age, and site with the disease group by general class balancer.', + }, + 1: { + 'name': diagnosis_label, + 'description': info['diagnosis']['instance'][diagnosis_code]['description'], + } + } + } + diagnosis_groups = [k for k in meta_data.keys() if "_sample" not in k] + return data[diagnosis_groups], meta_data + + +def read_ukbb_general_diagnosis(data_file): + # read data_file with subject id and all columns started with f.41270 + data = pd.read_csv(data_file, sep='\t', na_values='NA', index_col=0, low_memory=False) + data = data.filter(regex='^f.41270', axis=1) + for idx, row in data.iterrows(): + row = row.dropna() + # reduce the value to the first three characters + row = row.apply(lambda x: x[:3]) + row = row.tolist() + if len(row) > 0: + # assign as the default as control + data.loc[idx, 'icd10'] = row[0] + data.loc[idx, 'n_icd10'] = len(row) + data.loc[idx, 'diagnosis'] = 'HC' + # take the first value that matches diagnosis of interest + while row: + label = row.pop(0) + if label in info['diagnosis']['instance']: + data.loc[idx, 'diagnosis'] = info['diagnosis']['instance'][label]['label'] + data.loc[idx, 'icd10'] = label + break + else: + # no history of diagnosis at all wow + data.loc[idx, 'n_icd10'] = 0 + data.loc[idx, 'icd10'] = None + data.loc[idx, 'diagnosis'] = 'HC' + + meta_data = { + 'diagnosis': { + 'fid': 'f.41270.x', + 'description': 
+            'description': info['diagnosis']['description'],
+            'labels': {  # curated from ukbb website
+                'ADD': 'Alzheimer disease - Dementia',
+                'ALCO': 'Alcohol Abuse',
+                'DEP': 'Depressive disorder',
+                'SCZ': 'Schizophrenia',
+                'BIPOLAR': 'Bipolar disorder',
+                'PARK': 'Parkinson',
+                'MS': 'Multiple sclerosis',
+                'EPIL': 'Epilepsy',
+                'HC': 'Healthy control',
+            }
+        }
+    }
+    return data[['diagnosis', 'icd10', 'n_icd10']], meta_data
+
 
 def read_ukbb_data(data_file, info_label):
@@ -57,6 +209,11 @@ def read_ukbb_data(data_file, info_label):
             f'{list(info.keys())}'
         )
 
+    # diagnostic data has to be handled differently
+    if info_label == 'diagnosis':
+        return read_ukbb_general_diagnosis(data_file)
+    if info_label == 'per-group':
+        return read_ukbb_diagnosis_per_group(data_file)
     # compile meta data and construct id in the datafile (fid)
     # format: f.<field id>.<instance>.0
    # not sure what the trailing 0 is for, but it is always 0
@@ -111,11 +268,14 @@
     # use argparse to get datafile
     parser = argparse.ArgumentParser(description='Extract ukbb data')
     parser.add_argument('datafile', type=Path, help='ukbb data file')
+    parser.add_argument('qcfile', type=Path, help='giga_auto_qc output file')
     parser.add_argument('output', type=Path, help='output directory')
+    parser.add_argument('--confounds', type=str, nargs="+", help='confound columns', default=list())
 
     args = parser.parse_args()
     data_file = args.datafile
     output_dir = args.output
+    confounds = args.confounds
 
     curated_data = []
     curated_meta = {}
@@ -124,7 +284,37 @@ def read_ukbb_data(data_file, info_label):
         curated_data.append(data)
         curated_meta.update(meta_data)
     curated_data = pd.concat(curated_data, axis=1)
+    curated_data.index.name = 'participant_id'
+    curated_data = curated_data[~curated_data.index.duplicated(keep='first')]
 
-    curated_data.to_csv(output_dir / 'ukbb_pheno.tsv', sep='\t')
+    # add qc information
+    qc = pd.read_csv(args.qcfile, sep='\t', index_col='participant_id')[['mean_fd_raw', 'proportion_kept', 'pass_all_qc']]
+    qc = qc[~qc.index.duplicated(keep='first')]
+    curated_data = pd.concat([qc, curated_data], axis=1, join='inner')
+    passed_qc = curated_data[curated_data['pass_all_qc'] == True]
+
+    # save the data
+    passed_qc.to_csv(output_dir / 'ukbb_pheno.tsv', sep='\t')
     with open(output_dir / 'ukbb_pheno.json', 'w') as f:
         json.dump(curated_meta, f, indent=2)
+
+    # create a matched sample for each diagnosis group
+    if len(confounds) > 0:
+        downstreams = {}
+        diagnosis_groups = list(curated_meta['diagnosis']['labels'].keys())
+        diagnosis_groups.remove('HC')
+        for d in diagnosis_groups:
+            select_sample = gcb.class_balance(passed_qc[d].values.astype(int), passed_qc[confounds].values.T)
+            downstreams[d] = passed_qc.index[select_sample].tolist()
+        with open(output_dir / 'downstream_balanced_sample.json', 'w') as f:
+            json.dump(downstreams, f, indent=2)
+
+        # plot the distribution of confounds for each balanced sample
+        for d in diagnosis_groups:
+            subjects = downstreams[d]
+            df = passed_qc.loc[subjects, :]
+            fig, axes = plt.subplots(1, 4, figsize=(20, 5))
+            fig.suptitle(f"Confound balanced sample (N={len(subjects)}): " + curated_meta[d]['instance'][1]['description'])
+            for ax, c in zip(axes, ['sex', 'age', 'mean_fd_raw', 'proportion_kept']):
+                sns.histplot(x=c, data=df, hue=d, kde=True, ax=ax)
+            fig.savefig(output_dir / f'distribution_balanced_sample_{d}.png')
diff --git a/requirements.txt b/requirements.txt
index 69d9c3d..1af2d80 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,4 @@
+numpy
 pandas[excel]
+seaborn
+general_class_balancer @ git+https://github.com/SIMEXP/general_class_balancer@main
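
A minimal, self-contained sketch of the balancing step added in this patch, for reference. The toy dataframe, the 'DEP' column, and the confound names ('sex', 'age', 'site') are illustrative placeholders rather than the real UK Biobank columns; only the call pattern of gcb.class_balance mirrors the code above.

    import numpy as np
    import pandas as pd
    from general_class_balancer import general_class_balancer as gcb

    # toy stand-in for `passed_qc`: one binary diagnosis column plus confounds
    rng = np.random.default_rng(0)
    passed_qc = pd.DataFrame(
        {
            'DEP': rng.integers(0, 2, 200),   # 1 = case, 0 = everyone else
            'sex': rng.integers(0, 2, 200),
            'age': rng.normal(60, 7, 200),
            'site': rng.integers(0, 3, 200),
        },
        index=[f'sub-{i:04d}' for i in range(200)],
    )

    confounds = ['sex', 'age', 'site']  # what --confounds would pass on the CLI
    # class_balance returns a per-row selection mask; it is used here exactly as
    # in the patch to pick a confound-matched case/control subsample
    select_sample = gcb.class_balance(
        passed_qc['DEP'].values.astype(int), passed_qc[confounds].values.T
    )
    balanced_ids = passed_qc.index[select_sample].tolist()
    print(f'{len(balanced_ids)} participants retained for the DEP group')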
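And a hedged sketch of how the files written by the updated script could be consumed downstream. The command-line arguments, the output path, and the 'DEP' group are placeholders; only the file names (ukbb_pheno.tsv, downstream_balanced_sample.json) and the participant_id index come from the patch itself.

    # The script itself would be invoked roughly as (arguments are placeholders):
    #   python code/ukbb.py <ukbb_tabular.tsv> <giga_auto_qc.tsv> <output_dir> \
    #       --confounds sex age site
    import json
    from pathlib import Path

    import pandas as pd

    output_dir = Path('outputs')  # placeholder for the output directory argument
    pheno = pd.read_csv(
        output_dir / 'ukbb_pheno.tsv', sep='\t', index_col='participant_id'
    )
    with open(output_dir / 'downstream_balanced_sample.json') as f:
        balanced = json.load(f)

    # participants retained for one diagnosis group, e.g. depression ('DEP')
    dep_sample = pheno.loc[balanced['DEP']]
    print(dep_sample['DEP'].value_counts())  # cases vs matched controls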