Skip to content

Commit

Permalink
comments
Browse files Browse the repository at this point in the history
  • Loading branch information
[hansi thewarapperuma] committed Jan 23, 2024
1 parent 824ce50 commit a4f892f
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 2 deletions.
12 changes: 11 additions & 1 deletion Clin_sig.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
'''
Function: Evaluation through clinical significance criterion
Function: Evaluation through clinical significance criterion (Variant interpretation in clinical context)
Input: Dataframe
Output: A list of tuples containing index and evaluation (clinically_significant, not_clinically_significant, unresolved_clinica_significance, N/A)
Expand All @@ -14,23 +14,33 @@ def evaluate_clinical_significance(input_variants_df):
clnsig_column = 'CLNSIG'
dbnsfp_clnsig_column = 'dbNSFP_clinvar_clnsig'

# allocate customised scores for expected variant interpretations based on priority
pathogenic_score = 1
vus_score = 0.1
benign_score = 0.01

# initiate an empty list to store results
results = []

# iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# initially each column values were assigned to 0
clnsig_score, dbnsfp_score = 0, 0

# FOR clnsig_column
# exclude missing values
if str(row[clnsig_column]).lower() != 'n/a':

# when the condition is true; the clnsig_score is assigned with corresponding variant interpretation score
if 'pathogenic' in str(row[clnsig_column]).lower():
clnsig_score = pathogenic_score
elif 'benign' in str(row[clnsig_column]).lower():
clnsig_score = benign_score
elif 'uncertain_significance' in str(row[clnsig_column]).lower():
clnsig_score = vus_score

# FOR dbnsfp_column
if str(row[dbnsfp_clnsig_column]).lower() != 'n/a':
if 'pathogenic' in str(row[dbnsfp_clnsig_column]).lower():
dbnsfp_score = pathogenic_score
Expand Down
4 changes: 4 additions & 0 deletions Cons_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ def conservation_scores(input_variants_df):
false_count = sum(value is False for value in conserved_count)
na_count = len(conserved_count) - true_count - false_count

# TRUE implies the condition is satisfied, i.e. conserved
# FALSE implies the condition is not satisfied, i.e. not_conserved
# [if pd.notna..] was included due to the availability of N/A (separate na_count was taken)

# ************* ADOPTING MAJORITY VOTING ALGORITHM *****************

# Use simple if-else logic to determine conservation status
Expand Down
10 changes: 10 additions & 0 deletions Func_pred.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,20 @@ def in_silico_functional_predictions(input_variants_df):
cadd_column = 'dbNSFP_CADD_phred_hg19'
fathmm_column = 'dbNSFP_FATHMM_pred'

# create a list including the above-mentioned columns
func_pred_columns = [polyphen_column, sift_column, mutationtaster_column, provean_column, cadd_column,
fathmm_column]

# initiate an empty list to store results
results = []

# Iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# Check for N/A values
na_count = row[func_pred_columns].isna().sum()

# when all the column values are not 'N/A' ; check the conditions to determine the deleteriousness
if na_count < len(func_pred_columns): # If there is at least one non-N/A value
func_pred_count = [
'D' in str(row[polyphen_column]) or 'P' in str(row[polyphen_column]) if pd.notna(
Expand All @@ -43,6 +48,11 @@ def in_silico_functional_predictions(input_variants_df):
false_count = sum(value is False for value in func_pred_count)
na_count = len(func_pred_count) - true_count - false_count

# TRUE implies the condition is satisfied, i.e. deleterious
# FALSE implies the condition is not satisfied, i.e. not_deleterious
# [if pd.notna..] was included due to the availability of N/A (separate na_count was taken)


# ************* ADOPTING MAJORITY VOTING ALGORITHM *****************

# Use simple if-else logic to determine deleteriousness status
Expand Down
8 changes: 8 additions & 0 deletions Gen_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,23 @@
import pandas as pd

def evaluate_genomic_context(input_variants_df):

# define the relevant columns
impact_column = 'ANN[0].IMPACT'
ontology_terms_column = 'ANN[0].ANNOTATION'

# initiate an empty list to store results
results = []

# Iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# Check if the mentioned columns have non-null values
if pd.notna(row[impact_column]) and pd.notna(row[ontology_terms_column]):

# consider only high and moderate putative impact (neither LOW nor MODIFIER)
if 'HIGH' in row[impact_column] or 'MODERATE' in row[impact_column]:

# Check if 'ANN[0].ANNOTATION' is not one of the specified values
if row[ontology_terms_column] not in ['synonymous_variant', 'intergenic_region', 'intron_variant',
'intragenic_variant', 'start_retained_variant', 'stop_retained_variant',
Expand Down
3 changes: 2 additions & 1 deletion MAF.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def evaluate_minor_allele_freq(input_variants_df):
'AF_nfe_onf', 'AF_eas_oea', 'AF_nfe_nwe', 'AF_nfe_seu', 'AF_nfe_swe',
'AF_eas_jpn', 'AF_eas_kor', 'AF_fin', 'AF_asj', 'AF_nfe_est', 'AF_oth']]

# initiate an empty list to store final results
results = []

# Iterate through each row of the dataframe
Expand All @@ -30,7 +31,7 @@ def evaluate_minor_allele_freq(input_variants_df):
# Convert non-missing values to numerics
numeric_values = [pd.to_numeric(value, errors='coerce') for value in row]

#*********** IMPLEMENTATION OF SIMPLE ENSEMBLE LEARNING METHOD USING AVERAGE ***********
#*********** IMPLEMENTATION OF ENSEMBLE AVERAGING METHOD ***********

# Count occurrences of unique values for the row
value_counts = pd.Series(numeric_values).value_counts()
Expand Down
8 changes: 8 additions & 0 deletions Quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

def evaluate_quality(input_variants_df):

# define relevant columns
filter_column = 'FILTER'
gq_column = 'GEN[*].GQ'
qual_column = "QUAL"
Expand All @@ -12,14 +13,21 @@ def evaluate_quality(input_variants_df):
numeric_columns = [gq_column, qual_column, ad_column, dp_column]
input_variants_df[numeric_columns] = input_variants_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# initiate an empty list to store results
results = []

# Iterate through each row of the dataframe
for index, row in input_variants_df.iterrows():

# Check if all the mentioned columns have 'N/A' values
all_na = all(pd.isna(row[col]) or str(row[col]) == 'N/A' for col in numeric_columns)

# At least one non-'N/A' value in the mentioned columns
if not all_na:

# low quality rows are not considered
if 'LowQual' not in row[filter_column]:

# Cutoff quality is set to 20 (Phred-scaled)
if row[gq_column] >= 20:
results.append((index, "quality"))
Expand Down

0 comments on commit a4f892f

Please sign in to comment.