Skip to content

Commit

Permalink
Merge pull request PGScatalog#339 from fyvon/improve/qc_reported_trai…
Browse files Browse the repository at this point in the history
…ts_v2

Highlight and register new reported traits during import
  • Loading branch information
fyvon authored Mar 21, 2024
2 parents 3869bc7 + fb48ad2 commit 9fcab09
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 23 deletions.
6 changes: 4 additions & 2 deletions curation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,7 @@
}

# TSV file containing the reported traits to be replaced for homogeneity.
# Required columns: "trait_reported", "corrected".
reported_traits_replacement_file = '<local_dir>/reported_traits_dict.tsv'
# Required columns: "trait_reported", "corrected". Optional column: "date_added".
reported_traits_cleaning_config = {
'replacement_file': '<local_dir>/reported_traits_dict.tsv'
}
8 changes: 7 additions & 1 deletion curation/import_studies.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
from curation.imports.curation import CurationImport
from curation.config import *
from curation.imports.reported_trait_cleaner import ReportedTraitCleaner

reported_traits_cleaner = ReportedTraitCleaner(reported_traits_replacement_file=reported_traits_cleaning_config['replacement_file'])

# Main script
curation_import = CurationImport(
data_path=curation_directories, studies_list=study_names_list, curation_status_by_default=default_curation_status,
scoringfiles_format_version=scoringfiles_format_version, skip_scoringfiles=skip_scoringfiles,
skip_curationtracker=skip_curationtracker, variant_positions_qc_config=variant_positions_qc_config,
reported_traits_dict_file=reported_traits_replacement_file)
reported_traits_cleaner=reported_traits_cleaner)
curation_import.run_curation_import()

# Saving the reported trait cleaner for potential new terms
reported_traits_cleaner.export(curation_directories['studies_dir']+'/reported_traits_cleaner.tsv')
14 changes: 5 additions & 9 deletions curation/imports/curation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pandas as pd
import csv

from curation.imports.reported_trait_cleaner import ReportedTraitCleaner
from curation.imports.study import StudyImport
from curation.imports.scoring_file import ScoringFileUpdate, VariantPositionsQC
from curation_tracker.models import CurationPublicationAnnotation
Expand All @@ -18,7 +20,8 @@ class CurationImport():

failed_studies = {}

def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles, skip_curationtracker, variant_positions_qc_config, reported_traits_dict_file:str):
def __init__(self, data_path, studies_list, curation_status_by_default, scoringfiles_format_version, skip_scoringfiles,
skip_curationtracker, variant_positions_qc_config, reported_traits_cleaner: ReportedTraitCleaner):
self.curation2schema = pd.read_excel(data_path['template_schema'], sheet_name='Curation', index_col=0)
self.curation2schema_scoring = pd.read_excel(data_path['scoring_schema'], sheet_name='Columns', index_col=0)

Expand All @@ -33,14 +36,7 @@ def __init__(self, data_path, studies_list, curation_status_by_default, scoringf
self.curation_status_by_default = curation_status_by_default
self.variant_positions_qc_config = variant_positions_qc_config

# Reading the reported-traits dictionary file
try:
with open(reported_traits_dict_file, mode='r') as infile:
reader = csv.DictReader(infile, delimiter='\t')
self.reported_traits_cleaner = {row['trait_reported']: row['corrected'] for row in reader}
except FileNotFoundError:
print('ERROR: Could not find \'reported_traits_dict_file\'')
self.reported_traits_cleaner = {}
self.reported_traits_cleaner = reported_traits_cleaner

self.step = 1
self.steps_total = 2
Expand Down
77 changes: 77 additions & 0 deletions curation/imports/reported_trait_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import csv
from collections import OrderedDict
from datetime import date


class ReportedTraitCleaner:
    """Class handling the cleaning of reported traits in imported studies.

    The known traits are kept in ``reported_traits_dict``, an ordered mapping from
    the reported trait to a dict with the keys 'corrected', 'date_added' and 'is_new'.
    """

    def __init__(self, reported_traits_replacement_file: str):
        """
        Constructor. Requires the path to the file containing the mapping between known reported traits and their cleaned version.
        The file must be tab-delimited with the required columns 'trait_reported' and 'corrected'. Optional column: 'date_added'.

        Raises FileNotFoundError if the file does not exist, and KeyError if a required column is missing.
        """
        self.reported_traits_replacement_file = reported_traits_replacement_file
        self.reported_traits_dict = OrderedDict()
        try:
            # newline='' lets the csv module handle line endings itself (including newlines
            # embedded in quoted fields); the explicit encoding keeps reading consistent
            # with export() whatever the platform default is.
            with open(reported_traits_replacement_file, mode='r', newline='', encoding='utf-8') as infile:
                reader = csv.DictReader(infile, delimiter='\t')
                for row in reader:
                    # DictReader fills the missing trailing values of a short row with None:
                    # normalise them to '' so that every stored value is a string.
                    self.add_trait(
                        submitted_trait=row['trait_reported'],
                        corrected_trait=row['corrected'] or '',
                        date_added=row.get('date_added') or ''
                    )
        except FileNotFoundError:
            print('ERROR: Could not find \'reported_traits_replacement_file\': "{}"'.format(reported_traits_replacement_file))
            raise

    def clean_trait(self, submitted_trait: str) -> str:
        """
        Returns the cleaned version of the submitted trait if it exists and is defined, otherwise the submitted trait is
        returned unchanged. A trait which is not known yet is added to the cleaner database (flagged as new, with
        today's date and no corrected version).
        """
        if not self.__is_known_trait(submitted_trait):
            print('New reported trait "{}"'.format(submitted_trait))
            self.add_trait(submitted_trait=submitted_trait, corrected_trait='', date_added=str(date.today()), is_new=True)

        # An empty corrected version means "no replacement": keep the submitted trait.
        return self.__get_corrected_trait(submitted_trait) or submitted_trait

    def add_trait(self, submitted_trait: str, corrected_trait: str = '', date_added: str = '', is_new: bool = False) -> None:
        """
        Adds the given submitted trait to the cleaner database, with its corrected version if there is one.
        An existing entry for the same submitted trait is overwritten.
        """
        self.reported_traits_dict[submitted_trait] = {
            'corrected': corrected_trait,
            'date_added': date_added,
            'is_new': is_new
        }

    def __get_corrected_trait(self, submitted_trait: str) -> str:
        """Returns the corrected version stored for a known trait (can be an empty string)."""
        return self.reported_traits_dict[submitted_trait]['corrected']

    def __is_known_trait(self, submitted_trait: str) -> bool:
        """Tells whether the submitted trait is already in the cleaner database."""
        return submitted_trait in self.reported_traits_dict

    def export(self, exported_file: str):
        """
        Save the content of the cleaner database, including the added new traits, to a new file (tab-separated) which can be used for future imports.
        The list of the traits flagged as new is printed once the file has been written.
        """
        # newline='' prevents the text layer from translating the '\n' line terminator,
        # so the exported file is identical on every platform.
        with open(exported_file, mode='w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile, delimiter='\t',
                                lineterminator='\n',
                                quotechar='"',
                                quoting=csv.QUOTE_ALL
                                )
            writer.writerow(['trait_reported', 'corrected', 'date_added'])
            for trait_reported, trait_info in self.reported_traits_dict.items():
                writer.writerow([trait_reported, trait_info['corrected'], trait_info['date_added']])
        new_traits = [trait for trait, trait_info in self.reported_traits_dict.items() if trait_info['is_new']]
        print('Updated reported trait cleaner saved to "{}"'.format(exported_file))
        print('New traits: {}'.format(str(new_traits)))

    def __contains__(self, item):
        """Supports the `trait in cleaner` syntax."""
        return self.__is_known_trait(item)
3 changes: 2 additions & 1 deletion curation/imports/study.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# import gzip
# import pandas as pd
# import numpy as np
from curation.imports.reported_trait_cleaner import ReportedTraitCleaner
from curation.template_parser import *
from catalog.models import *

Expand All @@ -17,7 +18,7 @@ class StudyImport():
('sample', 'id', Sample)
]

def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default, reported_traits_cleaner:dict):
def __init__(self, study_data, studies_dir, curation_schema, curation_status_by_default, reported_traits_cleaner: ReportedTraitCleaner):
self.study_name = study_data['name']

if not studies_dir.endswith('/'):
Expand Down
12 changes: 7 additions & 5 deletions curation/parsers/score.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from django.db import IntegrityError, transaction

from curation.imports.reported_trait_cleaner import ReportedTraitCleaner
from curation.parsers.generic import GenericData
from curation.parsers.trait import TraitData
from catalog.models import Score
Expand All @@ -19,7 +21,7 @@ def __init__(self, score_name, spreadsheet_name):
self.data = {'name': score_name}

@transaction.atomic
def create_score_model(self, publication, reported_traits_cleaner: dict):
def create_score_model(self, publication, reported_traits_cleaner: ReportedTraitCleaner):
'''
Create an instance of the Score model.
It also creates instance(s) of the EFOTrait model if needed.
Expand All @@ -43,10 +45,10 @@ def create_score_model(self, publication, reported_traits_cleaner: dict):
if val in self.method_name_replacement.keys():
val = self.method_name_replacement[val]
elif field == 'trait_reported':
if val in reported_traits_cleaner:
new_val = reported_traits_cleaner[val]
print("Replaced reported trait \"{}\" with \"{}\"".format(val, new_val))
val = new_val
cleaned_val = reported_traits_cleaner.clean_trait(val)
if cleaned_val != val:
print("Replaced reported trait \"{}\" with \"{}\"".format(val, cleaned_val))
val = cleaned_val
setattr(self.model, field, val)
# Associate a Publication
self.model.publication = publication
Expand Down
4 changes: 3 additions & 1 deletion curation/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ variant_positions_qc_config = {
'minimum_match_rate': 0.9
}
reported_traits_replacement_file = '/Users/myhome/PGS/reported_traits_dict.tsv'
reported_traits_cleaning_config = {
'replacement_file': '<local_dir>/reported_traits_dict.tsv'
}
```

#### Additional attributes for the study_names_list
Expand Down
10 changes: 6 additions & 4 deletions curation/tests/test_import.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from django.test import TestCase
from curation.imports.curation import CurationImport
from catalog.models import *


from curation.imports.reported_trait_cleaner import ReportedTraitCleaner

# Configuration
curation_directories = {
Expand Down Expand Up @@ -30,7 +29,9 @@
'skip': True
}

reported_traits_replacement_file = './curation/tests/test_files/reported_traits_dict.tsv'
reported_traits_cleaning_config = {
'replacement_file': './curation/tests/test_files/reported_traits_dict.tsv'
}

# Test values
data_counts = {
Expand All @@ -46,11 +47,12 @@ class ImportTest(TestCase):

def run_import(self):
# Main script
reported_trait_cleaner = ReportedTraitCleaner(reported_traits_replacement_file=reported_traits_cleaning_config['replacement_file'])
curation_import = CurationImport(
data_path=curation_directories, studies_list=study_names_list, curation_status_by_default=default_curation_status,
scoringfiles_format_version=scoringfiles_format_version, skip_scoringfiles=skip_scorefiles,
skip_curationtracker=skip_curationtracker, variant_positions_qc_config=variant_positions_qc_config,
reported_traits_dict_file=reported_traits_replacement_file)
reported_traits_cleaner=reported_trait_cleaner)
curation_import.run_curation_import()


Expand Down

0 comments on commit 9fcab09

Please sign in to comment.