Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD diagnosis info to ukbb #26

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 192 additions & 2 deletions code/ukbb.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,50 @@
"""Load uk biobank data and extract demographic information.

Author: Hao-Ting Wang; last edit 2024-02-02
Author: Hao-Ting Wang; last edit 2024-07-11

All input stored in `data/ukbb` folder. The content of `data` is not
included in the repository.

Labels are curated from the ukbb website and saved to `data/ukbb`.
This is public information hence it is not included in the repository.
You will need to download coding9, coding10, and coding19.
For example, coding9 can be downloaded from:
https://biobank.ndph.ox.ac.uk/ukb/coding.cgi?id=9

data_file is provided by SJ's lab and requires DUA,
hence it is not included in the repository.

To expand this tool, add more information to `info` with information
from Data_fields xlsx file (provided by SJ's lab and requires DUA).

The output is saved in the output directory as `ukbb_pheno.tsv` and
`ukbb_pheno.json`. The json file contains the meta information for each
field in the tsv file.

The script also takes a qc file as input. The qc file is generated by
giga_auto_qc and contains the qc information for each participant.
The qc file is used to filter out participants that did not pass qc.
https://github.com/SIMEXP/giga_auto_qc

The script also takes a list of confound columns as input. The confound
columns are used to balance the sample for each diagnosis group. The
sample is balanced using the general_class_balancer:
https://github.com/mleming/general_class_balancer
Packaged version:
https://github.com/SIMEXP/general_class_balancer
This is the same procedure as AH's multi-site analysis:
https://github.com/harveyaa/neuropsych_mtl/blob/master/conf_balancing

The script also outputs a json file with the balanced sample for each
diagnosis group. The json file is saved as `downstream_balanced_sample.json`.
"""
from pathlib import Path
import pandas as pd
import json
import argparse
import seaborn as sns
import matplotlib.pyplot as plt
from general_class_balancer import general_class_balancer as gcb

# phenotypic information curated through ukbb website
info = {
Expand All @@ -45,9 +71,135 @@
'description': 'first repeat imaging visit',
}
}
},
'diagnosis':{
'id': 41270,
'description': 'Diagnoses - International Classification of Disease version 10 (ICD10)',
'labels': 'coding19.tsv', # download from ukbb website
'instance':{
'F00': {
'label': 'ADD',
'description': 'Alzheimer disease - Dementia',
},
'F10': {
'label': 'ALCO',
'description': 'Alcohol Abuse',
},
'F20': {
'label': 'SCZ',
'description': 'Schizophrenia',
},
'F31': {
'label': 'BIPOLAR',
'description': 'Bipolar disorder',
},
'F32': {
'label': 'DEP',
'description': 'Depressive disorder',
},
'G20': {
'label': 'PARK',
'description': 'Parkinson',
},
'G30': {
'label': 'ADD',
'description': 'Alzheimer disease - Dementia',
},
'G35': {
'label': 'MS',
'description': 'Multiple sclerosis',
},
'G40': {
'label': 'EPIL',
'description': 'Epilepsy',
},
}
},
'per-group':{
'description': 'Mathcing control vs disease per group.',
}
}

def read_ukbb_diagnosis_per_group(data_file):
    """Build one binary (0/1) column per diagnosis group from ICD10 codes.

    Parameters
    ----------
    data_file : str or pathlib.Path
        Tab-separated UK Biobank phenotype file. The first column is the
        participant id (used as index); diagnosis columns start with
        ``f.41270`` and hold full ICD10 codes (``NA`` for missing).

    Returns
    -------
    data : pandas.DataFrame
        One column per diagnosis label in ``info['diagnosis']['instance']``,
        set to 1 when the participant carries any ICD10 code mapping to
        that label, 0 otherwise.
    meta_data : dict
        Per-label meta information describing the 0 (control) and 1
        (diagnosed) values.
    """
    data = pd.read_csv(
        data_file, sep='\t', na_values='NA', index_col=0, low_memory=False
    )
    # keep only the ICD10 diagnosis fields (f.41270.*)
    data = data.filter(regex='^f.41270', axis=1)
    icd_columns = list(data.columns)

    # Initialise every diagnosis column to 0 and build meta data up front.
    # Several ICD10 codes can share one label (e.g. F00 and G30 both map to
    # ADD); initialising once here - instead of per code inside the row
    # loop - avoids a later code resetting a label that an earlier code
    # already flagged for the same participant.
    meta_data = {}
    for diagnosis_code, entry in info['diagnosis']['instance'].items():
        diagnosis_label = entry['label']
        if diagnosis_label not in meta_data:
            data[diagnosis_label] = 0
            meta_data[diagnosis_label] = {
                'fid': diagnosis_label,
                'description': diagnosis_label,
                'instance': {
                    0: {
                        'name': 'control',
                        'description': 'Healthy control, matched sex, age, and site with the disease group by general class balancer.',
                    },
                    1: {
                        'name': diagnosis_label,
                        'description': entry['description'],
                    },
                },
            }

    for idx, row in data[icd_columns].iterrows():
        # reduce each full ICD10 code to its three-character category
        codes = {value[:3] for value in row.dropna()}
        for diagnosis_code, entry in info['diagnosis']['instance'].items():
            if diagnosis_code in codes:
                data.loc[idx, entry['label']] = 1

    diagnosis_groups = [k for k in meta_data.keys() if "_sample" not in k]
    return data[diagnosis_groups], meta_data


def read_ukbb_general_diagnosis(data_file):
    """Derive a single primary diagnosis label per participant.

    Parameters
    ----------
    data_file : str or pathlib.Path
        Tab-separated UK Biobank phenotype file. The first column is the
        participant id (used as index); diagnosis columns start with
        ``f.41270`` and hold full ICD10 codes (``NA`` for missing).

    Returns
    -------
    data : pandas.DataFrame
        Columns ``diagnosis`` (label of the first ICD10 code of interest,
        or ``'HC'``), ``icd10`` (the matched three-character code, else
        the first recorded code, else None), and ``n_icd10`` (number of
        recorded codes).
    meta_data : dict
        Meta information for the ``diagnosis`` field.
    """
    data = pd.read_csv(
        data_file, sep='\t', na_values='NA', index_col=0, low_memory=False
    )
    # keep only the ICD10 diagnosis fields (f.41270.*)
    data = data.filter(regex='^f.41270', axis=1)
    known_codes = info['diagnosis']['instance']
    for participant, record in data.iterrows():
        # three-character ICD10 categories of this participant's diagnoses
        codes = [full_code[:3] for full_code in record.dropna()]
        if not codes:
            # participant has no recorded diagnosis history at all
            data.loc[participant, 'n_icd10'] = 0
            data.loc[participant, 'icd10'] = None
            data.loc[participant, 'diagnosis'] = 'HC'
            continue
        # default: healthy control, carrying the first recorded code
        data.loc[participant, 'icd10'] = codes[0]
        data.loc[participant, 'n_icd10'] = len(codes)
        data.loc[participant, 'diagnosis'] = 'HC'
        # the first code of interest, in recorded order, wins
        for code in codes:
            if code in known_codes:
                data.loc[participant, 'diagnosis'] = known_codes[code]['label']
                data.loc[participant, 'icd10'] = code
                break

    meta_data = {
        'diagnosis': {
            'fid': 'f.41270.x',
            'description': info['diagnosis']['description'],
            'labels': {  # curated from ukbb website
                'ADD': 'Alzheimer disease - Dementia',
                'ALCO': 'Alcohol Abuse',
                'DEP': 'Depressive disorder',
                'SCZ': 'Schizophrenia',
                'BIPOLAR': 'Bipolar disorder',
                'PARK': 'Parkinson',
                'MS': 'Multiple sclerosis',
                'EPIL': 'Epilepsy',
                'HC': 'Healthy control',
            }
        }
    }
    return data[['diagnosis', 'icd10', 'n_icd10']], meta_data


def read_ukbb_data(data_file, info_label):

Expand All @@ -57,6 +209,11 @@ def read_ukbb_data(data_file, info_label):
f'{list(info.keys())}'
)

# diagnostic data has to be handled differently
if info_label == 'diagnosis':
return read_ukbb_general_diagnosis(data_file)
if info_label == 'per-group':
return read_ukbb_diagnosis_per_group(data_file)
# compile meta data and construct id in the datafile (fid)
# format: f.<id>.<instance>.0
# not sure the field for 0 is for, but it is always 0
Expand Down Expand Up @@ -111,11 +268,14 @@ def read_ukbb_data(data_file, info_label):
# use argparse to get datafile
parser = argparse.ArgumentParser(description='Extract ukbb data')
parser.add_argument('datafile', type=Path, help='ukbb data file')
parser.add_argument('qcfile', type=Path, help='giga auto qc output file')
parser.add_argument('output', type=Path, help='output directory')
parser.add_argument('--confounds', type=str, nargs="+", help='confound columns', default=list())

args = parser.parse_args()
data_file = args.datafile
output_dir = args.output
confounds = args.confounds

curated_data = []
curated_meta = {}
Expand All @@ -124,7 +284,37 @@ def read_ukbb_data(data_file, info_label):
curated_data.append(data)
curated_meta.update(meta_data)
curated_data = pd.concat(curated_data, axis=1)
curated_data.index.name = 'participant_id'
curated_data = curated_data[~curated_data.index.duplicated(keep='first')]

curated_data.to_csv(output_dir / 'ukbb_pheno.tsv', sep='\t')
# add qc information
qc = pd.read_csv(args.qcfile, sep='\t', index_col='participant_id')[['mean_fd_raw', 'proportion_kept', 'pass_all_qc']]
qc = qc[~qc.index.duplicated(keep='first')]
curated_data = pd.concat([qc, curated_data], axis=1, join='inner')
passed_qc = curated_data[curated_data['pass_all_qc'] == True]

# save the data
passed_qc.to_csv(output_dir / 'ukbb_pheno.tsv', sep='\t')
with open(output_dir / 'ukbb_pheno.json', 'w') as f:
json.dump(curated_meta, f, indent=2)

# create a matched sample for each diagnosis group
if len(confounds) > 0:
downstreams = {}
diagnosis_groups = list(curated_meta['diagnosis']['labels'].keys())
diagnosis_groups.remove('HC')
for d in diagnosis_groups:
select_sample = gcb.class_balance(passed_qc[d].values.astype(int), passed_qc[confounds].values.T)
downstreams[d] = passed_qc.index[select_sample].tolist()
with open(output_dir / 'downstream_balanced_sample.json', 'w') as f:
json.dump(downstreams, f, indent=2)

# plot the distribution of confounds
for d in diagnosis_groups:
subjects = downstreams[d]
df = passed_qc.loc[subjects, :]
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
fig.suptitle(f"Confound blanced sample (N={len(subjects)}): " + curated_meta[d]['instance'][1]['description'])
for ax, c in zip(axes, ['sex', 'age', 'mean_fd_raw', 'proportion_kept']):
sns.histplot(x=c, data=df, hue=d, kde=True, ax=ax)
fig.savefig(output_dir / f'distribution_balanced_sample_{d}.png')
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
numpy
pandas[excel]
seaborn
general_class_balancer @ git+https://github.com/SIMEXP/general_class_balancer@main