Skip to content

Commit

Permalink
comments
Browse files Browse the repository at this point in the history
  • Loading branch information
[hansi thewarapperuma] committed Jan 23, 2024
1 parent 824ce50 commit a4f892f
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 2 deletions.
12 changes: 11 additions & 1 deletion Clin_sig.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
'''
Function: Evaluation through clinical significance criterion
Function: Evaluation through clinical significance criterion (Variant interpretation in clinical context)
Input: Dataframe
Output: A list of tuples containing index and evaluation (clinically_significant, not_clinically_significant, unresolved_clinica_significance, N/A)
Expand All @@ -14,23 +14,33 @@ def evaluate_clinical_significance(input_variants_df):
clnsig_column = 'CLNSIG'
dbnsfp_clnsig_column = 'dbNSFP_clinvar_clnsig'

# allocate customised scores for expected variant interpretations based on priority
pathogenic_score = 1
vus_score = 0.1
benign_score = 0.01

# initiate an empty list to store results
results = []

# iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# initially each column values were assigned to 0
clnsig_score, dbnsfp_score = 0, 0

# FOR clnsig_column
# exclude missing values
if str(row[clnsig_column]).lower() != 'n/a':

# when the condition is true; the clnsig_score is assigned with corresponding variant interpretation score
if 'pathogenic' in str(row[clnsig_column]).lower():
clnsig_score = pathogenic_score
elif 'benign' in str(row[clnsig_column]).lower():
clnsig_score = benign_score
elif 'uncertain_significance' in str(row[clnsig_column]).lower():
clnsig_score = vus_score

# FOR dbnsfp_column
if str(row[dbnsfp_clnsig_column]).lower() != 'n/a':
if 'pathogenic' in str(row[dbnsfp_clnsig_column]).lower():
dbnsfp_score = pathogenic_score
Expand Down
4 changes: 4 additions & 0 deletions Cons_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ def conservation_scores(input_variants_df):
false_count = sum(value is False for value in conserved_count)
na_count = len(conserved_count) - true_count - false_count

# TRUE implies the condition is satisfied, i.e. conserved
# FALSE implies the condition is not satisfied, i.e. not_conserved
# [if pd.notna..] was included due to the availability of N/A (separate na_count was taken)

# ************* ADOPTING MAJORITY VOTING ALGORITHM *****************

# Use simple if-else logic to determine conservation status
Expand Down
10 changes: 10 additions & 0 deletions Func_pred.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,20 @@ def in_silico_functional_predictions(input_variants_df):
cadd_column = 'dbNSFP_CADD_phred_hg19'
fathmm_column = 'dbNSFP_FATHMM_pred'

# create a list including the above-mentioned columns
func_pred_columns = [polyphen_column, sift_column, mutationtaster_column, provean_column, cadd_column,
fathmm_column]

# initiate an empty list to store results
results = []

# Iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# Check for N/A values
na_count = row[func_pred_columns].isna().sum()

# when all the column values are not 'N/A' ; check the conditions to determine the deleteriousness
if na_count < len(func_pred_columns): # If there is at least one non-N/A value
func_pred_count = [
'D' in str(row[polyphen_column]) or 'P' in str(row[polyphen_column]) if pd.notna(
Expand All @@ -43,6 +48,11 @@ def in_silico_functional_predictions(input_variants_df):
false_count = sum(value is False for value in func_pred_count)
na_count = len(func_pred_count) - true_count - false_count

# TRUE implies the condition is satisfied, i.e. deleterious
# FALSE implies the condition is not satisfied, i.e. not_deleterious
# [if pd.notna..] was included due to the availability of N/A (separate na_count was taken)


# ************* ADOPTING MAJORITY VOTING ALGORITHM *****************

# Use simple if-else logic to determine deleteriousness status
Expand Down
8 changes: 8 additions & 0 deletions Gen_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,23 @@
import pandas as pd

def evaluate_genomic_context(input_variants_df):

# define the relevant columns
impact_column = 'ANN[0].IMPACT'
ontology_terms_column = 'ANN[0].ANNOTATION'

# initiate an empty list to store results
results = []

# Iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# Check if the mentioned columns have non-null values
if pd.notna(row[impact_column]) and pd.notna(row[ontology_terms_column]):

# consider only high and moderate putative impact (neither LOW nor MODIFIER)
if 'HIGH' in row[impact_column] or 'MODERATE' in row[impact_column]:

# Check if 'ANN[0].ANNOTATION' is not one of the specified values
if row[ontology_terms_column] not in ['synonymous_variant', 'intergenic_region', 'intron_variant',
'intragenic_variant', 'start_retained_variant', 'stop_retained_variant',
Expand Down
3 changes: 2 additions & 1 deletion MAF.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def evaluate_minor_allele_freq(input_variants_df):
'AF_nfe_onf', 'AF_eas_oea', 'AF_nfe_nwe', 'AF_nfe_seu', 'AF_nfe_swe',
'AF_eas_jpn', 'AF_eas_kor', 'AF_fin', 'AF_asj', 'AF_nfe_est', 'AF_oth']]

# initiate an empty list to store final results
results = []

# Iterate through each row of the dataframe
Expand All @@ -30,7 +31,7 @@ def evaluate_minor_allele_freq(input_variants_df):
# Convert non-missing values to numerics
numeric_values = [pd.to_numeric(value, errors='coerce') for value in row]

#*********** IMPLEMENTATION OF SIMPLE ENSEMBLE LEARNING METHOD USING AVERAGE ***********
#*********** IMPLEMENTATION OF ENSEMBLE AVERAGING METHOD ***********

# Count occurrences of unique values for the row
value_counts = pd.Series(numeric_values).value_counts()
Expand Down
8 changes: 8 additions & 0 deletions Quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

def evaluate_quality(input_variants_df):

# define relevant columns
filter_column = 'FILTER'
gq_column = 'GEN[*].GQ'
qual_column = "QUAL"
Expand All @@ -12,14 +13,21 @@ def evaluate_quality(input_variants_df):
numeric_columns = [gq_column, qual_column, ad_column, dp_column]
input_variants_df[numeric_columns] = input_variants_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# initiate an empty list to store results
results = []

# Iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# Check if all the mentioned columns have 'N/A' values
all_na = all(pd.isna(row[col]) or str(row[col]) == 'N/A' for col in numeric_columns)

# At least one non-'N/A' value in the mentioned columns
if not all_na:

# low quality rows are not considered
if 'LowQual' not in row[filter_column]:

# Cutoff quality is set to 20 (Phred-scaled)
if row[gq_column] >= 20:
results.append((index, "quality"))
Expand Down

0 comments on commit a4f892f

Please sign in to comment.