From a4f892fbd96c3ac00b00f5829d9082044a853cbf Mon Sep 17 00:00:00 2001
From: "[hansi thewarapperuma]" <[hansithewarapperuma@gmail.com]>
Date: Tue, 23 Jan 2024 10:36:48 +0530
Subject: [PATCH] comments

---
 Clin_sig.py    | 12 +++++++++++-
 Cons_score.py  |  4 ++++
 Func_pred.py   | 10 ++++++++++
 Gen_context.py |  8 ++++++++
 MAF.py         |  3 ++-
 Quality.py     |  8 ++++++++
 6 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/Clin_sig.py b/Clin_sig.py
index 1119abf..5849069 100644
--- a/Clin_sig.py
+++ b/Clin_sig.py
@@ -1,5 +1,5 @@
 '''
-Function: Evaluation through clinical significance criterion
+Function: Evaluation through clinical significance criterion (Variant interpretation in clinical context)
 Input: Dataframe
 Output: A list of tuples containing index and evaluation (clinically_significant, not_clinically_significant, unresolved_clinica_significance, N/A)
@@ -14,16 +14,25 @@ def evaluate_clinical_significance(input_variants_df):
     clnsig_column = 'CLNSIG'
     dbnsfp_clnsig_column = 'dbNSFP_clinvar_clnsig'

+    # allocate customised scores for the expected variant interpretations, based on priority
     pathogenic_score = 1
     vus_score = 0.1
     benign_score = 0.01

+    # initiate an empty list to store results
     results = []

+    # iterate through each row of the dataframe
     for index, row in input_variants_df.iterrows():
+
+        # initially, both column scores are set to 0
         clnsig_score, dbnsfp_score = 0, 0

+        # FOR clnsig_column
+        # exclude missing values
         if str(row[clnsig_column]).lower() != 'n/a':
+
+            # when the condition is true, clnsig_score is assigned the corresponding variant interpretation score
             if 'pathogenic' in str(row[clnsig_column]).lower():
                 clnsig_score = pathogenic_score
             elif 'benign' in str(row[clnsig_column]).lower():
@@ -31,6 +40,7 @@ def evaluate_clinical_significance(input_variants_df):
         elif 'uncertain_significance' in str(row[clnsig_column]).lower():
             clnsig_score = vus_score

+        # FOR dbnsfp_column
         if str(row[dbnsfp_clnsig_column]).lower() != 'n/a':
             if 'pathogenic' in str(row[dbnsfp_clnsig_column]).lower():
                 dbnsfp_score = pathogenic_score
diff --git a/Cons_score.py b/Cons_score.py
index c54ce97..3b9a475 100644
--- a/Cons_score.py
+++ b/Cons_score.py
@@ -48,6 +48,10 @@ def conservation_scores(input_variants_df):
         false_count = sum(value is False for value in conserved_count)
         na_count = len(conserved_count) - true_count - false_count

+        # TRUE implies the condition is satisfied, i.e. conserved
+        # FALSE implies the condition is not satisfied, i.e. not_conserved
+        # [if pd.notna..] was included because N/A values can occur (a separate na_count was taken)
+
         # ************* ADOPTING MAJORITY VOTING ALGORITHM *****************

         # Use simple if-else logic to determine conservation status
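
Note: the TRUE / FALSE / N/A counting commented in Cons_score.py above feeds the majority vote announced by the banner; the same counting pattern appears again in Func_pred.py below. A small stand-alone sketch of that idea follows. It is an illustration only: the function name, the label strings and the plurality rule are assumptions, since the actual if-else logic sits outside this hunk.

# Stand-alone illustration of the majority-voting idea; not code from Cons_score.py.
def majority_vote(calls):
    """calls: per-tool results, each True (conserved), False (not conserved) or None (N/A)."""
    true_count = sum(value is True for value in calls)
    false_count = sum(value is False for value in calls)
    na_count = len(calls) - true_count - false_count

    # simple plurality rule; the repository's actual tie handling may differ
    if true_count > false_count and true_count >= na_count:
        return "conserved"
    if false_count > true_count and false_count >= na_count:
        return "not_conserved"
    return "N/A"

print(majority_vote([True, True, None, False]))   # -> conserved
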
diff --git a/Func_pred.py b/Func_pred.py
index ec15a2a..cb4127f 100644
--- a/Func_pred.py
+++ b/Func_pred.py
@@ -18,15 +18,20 @@ def in_silico_functional_predictions(input_variants_df):
     cadd_column = 'dbNSFP_CADD_phred_hg19'
     fathmm_column = 'dbNSFP_FATHMM_pred'

+    # create a list including the above-mentioned columns
     func_pred_columns = [polyphen_column, sift_column, mutationtaster_column,
                          provean_column, cadd_column, fathmm_column]

+    # initiate an empty list to store results
     results = []

+    # Iterate through each row of the dataframe
     for index, row in input_variants_df.iterrows():
+        # Check for N/A values
        na_count = row[func_pred_columns].isna().sum()

+        # when at least one column value is not 'N/A', check the conditions to determine deleteriousness
        if na_count < len(func_pred_columns):  # If there is at least one non-N/A value
            func_pred_count = [
                'D' in str(row[polyphen_column]) or 'P' in str(row[polyphen_column]) if pd.notna(
@@ -43,6 +48,11 @@ def in_silico_functional_predictions(input_variants_df):
            false_count = sum(value is False for value in func_pred_count)
            na_count = len(func_pred_count) - true_count - false_count

+            # TRUE implies the condition is satisfied, i.e. deleterious
+            # FALSE implies the condition is not satisfied, i.e. not_deleterious
+            # [if pd.notna..] was included because N/A values can occur (a separate na_count was taken)
+
+
            # ************* ADOPTING MAJORITY VOTING ALGORITHM *****************

            # Use simple if-else logic to determine conservation status
diff --git a/Gen_context.py b/Gen_context.py
index a1a4283..e9bfbe9 100644
--- a/Gen_context.py
+++ b/Gen_context.py
@@ -10,15 +10,23 @@ import pandas as pd
 def evaluate_genomic_context(input_variants_df):
+
+    # define the relevant columns
     impact_column = 'ANN[0].IMPACT'
     ontology_terms_column = 'ANN[0].ANNOTATION'

+    # initiate an empty list to store results
     results = []

+    # Iterate through each row of the dataframe
     for index, row in input_variants_df.iterrows():
+        # Check if the mentioned columns have non-null values
         if pd.notna(row[impact_column]) and pd.notna(row[ontology_terms_column]):
+
+            # consider only HIGH and MODERATE putative impact (neither LOW nor MODIFIER)
             if 'HIGH' in row[impact_column] or 'MODERATE' in row[impact_column]:
+                # Check if 'ANN[0].ANNOTATION' is not one of the specified values
                 if row[ontology_terms_column] not in ['synonymous_variant', 'intergenic_region',
                                                       'intron_variant', 'intragenic_variant',
                                                       'start_retained_variant', 'stop_retained_variant',
diff --git a/MAF.py b/MAF.py
index 9f42abe..3bb66f2 100644
--- a/MAF.py
+++ b/MAF.py
@@ -16,6 +16,7 @@ def evaluate_minor_allele_freq(input_variants_df):
                        'AF_nfe_onf', 'AF_eas_oea', 'AF_nfe_nwe', 'AF_nfe_seu', 'AF_nfe_swe',
                        'AF_eas_jpn', 'AF_eas_kor', 'AF_fin', 'AF_asj', 'AF_nfe_est', 'AF_oth']]

+    # initiate an empty list to store final results
     results = []

     # Iterate through each row of the dataframe
@@ -30,7 +31,7 @@ def evaluate_minor_allele_freq(input_variants_df):
         # Convert non-missing values to numerics
         numeric_values = [pd.to_numeric(value, errors='coerce') for value in row]

-        #*********** IMPLEMENTATION OF SIMPLE ENSEMBLE LEARNING METHOD USING AVERAGE ***********
+        #*********** IMPLEMENTATION OF ENSEMBLE AVERAGING METHOD ***********

         # Count occurrences of unique values for the row
         value_counts = pd.Series(numeric_values).value_counts()
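
Note: the renamed banner in MAF.py refers to averaging the per-population AF_* columns of a single variant row. A small stand-alone sketch of such an ensemble average follows; the helper name, the 0.01 rarity cutoff and the "rare"/"not_rare" labels are assumptions for illustration, not values taken from MAF.py.

# Stand-alone illustration of averaging the AF_* values of one row; not code from MAF.py.
import pandas as pd

def average_maf(row_values, cutoff=0.01):
    """row_values: the AF_* values of one row; cutoff is an assumed rarity threshold."""
    numeric_values = pd.Series(
        [pd.to_numeric(value, errors='coerce') for value in row_values]
    ).dropna()
    if numeric_values.empty:
        return "N/A"
    mean_af = numeric_values.mean()          # ensemble average across populations
    return "rare" if mean_af < cutoff else "not_rare"

print(average_maf(['0.0004', 'N/A', '0.002']))   # -> rare
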
diff --git a/Quality.py b/Quality.py
index 766f7d7..e0d01c0 100644
--- a/Quality.py
+++ b/Quality.py
@@ -2,6 +2,7 @@
 def evaluate_quality(input_variants_df):
+    # define relevant columns
     filter_column = 'FILTER'
     gq_column = 'GEN[*].GQ'
     qual_column = "QUAL"
@@ -12,14 +13,21 @@ def evaluate_quality(input_variants_df):
     numeric_columns = [gq_column, qual_column, ad_column, dp_column]
     input_variants_df[numeric_columns] = input_variants_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

+    # initiate an empty list to store results
     results = []

+    # Iterate through each row of the dataframe
     for index, row in input_variants_df.iterrows():
+        # Check if all the mentioned columns have 'N/A' values
         all_na = all(pd.isna(row[col]) or str(row[col]) == 'N/A' for col in numeric_columns)

+        # At least one non-'N/A' value in the mentioned columns
         if not all_na:
+
+            # low-quality rows are not considered
            if 'LowQual' not in row[filter_column]:
+                # Cutoff quality is set to 20 (Phred-scaled)
                 if row[gq_column] >= 20:
                     results.append((index, "quality"))
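
Note: the Quality.py comments above describe skipping 'LowQual' rows and requiring a Phred-scaled genotype quality of at least 20, which corresponds to roughly a 1% genotyping error probability. A small stand-alone sketch of that check follows; the helper name is hypothetical, while the column names mirror the ones referenced in the patch.

# Stand-alone illustration of the row-level quality check; not code from Quality.py.
import pandas as pd

GQ_CUTOFF = 20   # Phred-scaled genotype quality; Q20 ~ 1% genotyping error probability

def passes_quality(row):
    """row: a pandas Series carrying at least 'FILTER' and 'GEN[*].GQ'."""
    if 'LowQual' in str(row['FILTER']):
        return False                                   # low-quality calls are skipped
    gq = pd.to_numeric(row['GEN[*].GQ'], errors='coerce')
    return bool(pd.notna(gq) and gq >= GQ_CUTOFF)

row = pd.Series({'FILTER': 'PASS', 'GEN[*].GQ': 35, 'QUAL': 210.5})
print(passes_quality(row))   # -> True
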