Skip to content

Commit

Permalink
[GEN-809] Validate allele columns (#539)
Browse files Browse the repository at this point in the history
* add code for allele validation - initial
* revamp validation msg and function for combination and individually used allele values
* remove NA check in _check_allele_col
* add allele validation special handling for all nas and non-str cols
  • Loading branch information
rxu17 authored Nov 9, 2023
1 parent e2764b2 commit c430754
Show file tree
Hide file tree
Showing 6 changed files with 390 additions and 39 deletions.
94 changes: 94 additions & 0 deletions genie/validate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import re
import logging
from typing import Dict, List, Optional

Expand Down Expand Up @@ -415,3 +416,96 @@ def standardize_string_for_validation(
return standardized_str
else:
return input_string


def get_invalid_allele_rows(
    input_data: pd.DataFrame,
    input_col: str,
    allowed_comb_alleles: list,
    allowed_ind_alleles: list,
    ignore_case: bool = False,
    allow_na: bool = False,
) -> pd.Index:
    """
    Find invalid indices in a DataFrame column based on allowed allele values.

    A value is valid when it is either a concatenation of one or more of the
    ``allowed_comb_alleles`` or exactly one of the ``allowed_ind_alleles`` on
    its own. When both lists are empty no non-NA value is valid.

    Args:
        input_data (pd.DataFrame): The DataFrame to search.
        input_col (str): The name of the column to check.
        allowed_comb_alleles (list): The list of allowed allele values
            (can appear in combinations or individually)
        allowed_ind_alleles (list): The list of allowed allele values
            (can only appear individually)
        ignore_case (bool, optional): whether to perform case-insensitive
            matching
        allow_na (bool, optional): whether to allow NAs to be an allowed allele
            value or not.

    Returns:
        pd.Index: A pandas index object indicating the row indices that
            don't match the allowed alleles
    """
    # One regex alternative per allele group. Every allele is escaped as a
    # whole token so multi-character alleles and regex metacharacters such as
    # "-" are matched literally.
    alternatives = []
    if allowed_comb_alleles:
        comb_tokens = "|".join(re.escape(str(allele)) for allele in allowed_comb_alleles)
        # "+" lets the combinable alleles repeat and mix with each other
        alternatives.append(f"(?:{comb_tokens})+")
    if allowed_ind_alleles:
        ind_tokens = "|".join(re.escape(str(allele)) for allele in allowed_ind_alleles)
        # no quantifier: these alleles must be the entire value by themselves
        alternatives.append(f"(?:{ind_tokens})")

    # Joining the groups avoids a dangling leading "|" (an empty alternative
    # matches every string). "(?!)" never matches, so with no allowed alleles
    # every non-NA value is reported as invalid.
    pattern_body = "|".join(alternatives) if alternatives else "(?!)"
    # \Z anchors at the true end of the string; "$" would also accept a
    # trailing newline.
    search_str = rf"^(?:{pattern_body})\Z"

    flags = re.IGNORECASE if ignore_case else 0  # 0 means no flags

    # special handling for all NA column: every row shares the same verdict,
    # so the string matching below is skipped entirely
    if pd.isna(input_data[input_col]).all():
        return pd.Index([]) if allow_na else input_data.index

    # convert numeric cols to string while preserving NAs in order to use
    # str.match; only the checked column is copied so the caller's frame is
    # left untouched without duplicating every other column
    transformed_data = input_data[[input_col]].copy()
    transformed_data[input_col] = transform._convert_col_with_nas_to_str(
        transformed_data, input_col
    )

    # NA entries take the value of allow_na, so the mask has no missing values
    matching_indices = (
        transformed_data[input_col]
        .str.match(search_str, flags=flags, na=allow_na)
        .astype(bool)
    )
    return transformed_data[~matching_indices].index


def get_allele_validation_message(
    invalid_indices: pd.Series,
    invalid_col: str,
    allowed_comb_alleles: list,
    allowed_ind_alleles: list,
    fileformat: str,
) -> tuple:
    """Build the error/warning text reported by the invalid allele check.

    Args:
        invalid_indices (pd.Series): the row indices that have invalid
            alleles; only its length is inspected
        invalid_col (str): The column with the invalid values
        allowed_comb_alleles (list): The list of allowed allele values
            (can appear in combinations or individually)
        allowed_ind_alleles (list): The list of allowed allele values
            (can only appear individually)
        fileformat (str): Name of the fileformat

    Returns:
        tuple: ``(errors, warnings)`` from the allele validation. Both are
            blank strings when there are no invalid rows; the warnings
            element is always blank.
    """
    blank = ""
    # Nothing invalid: report neither an error nor a warning
    if len(invalid_indices) == 0:
        return blank, blank

    combinable = ",".join(allowed_comb_alleles)
    standalone = ",".join(allowed_ind_alleles)
    error_message = (
        f"{fileformat}: Your {invalid_col} column has invalid allele values. "
        "This is the list of accepted allele values that can appear individually "
        f"or in combination with each other: {combinable}.\n"
        "This is the list of accepted allele values that can only appear individually: "
        f"{standalone}\n"
    )
    return error_message, blank
32 changes: 24 additions & 8 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,6 @@ def _check_allele_col(df, col):
error = ""
warning = ""
if col_exist:
# CHECK: The value "NA" can't be used as a placeholder
if sum(df[col].fillna("") == "NA") > 0:
warning = (
"maf: "
f"{col} column contains 'NA' values, "
"which cannot be placeholders for blank values. "
"Please put in empty strings for blank values.\n"
)
# CHECK: There can't be any null values
if sum(df[col].isnull()) > 0:
error = f"maf: {col} can't have any blank or null values.\n"
Expand All @@ -70,6 +62,9 @@ class maf(FileTypeFormat):
_fileType = "maf"

_process_kwargs = []
_allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
_allowed_comb_alleles = ["A", "T", "C", "G", "N"]
_allowed_ind_alleles = ["-"]

def _validateFilename(self, filePath):
"""
Expand Down Expand Up @@ -294,6 +289,27 @@ def _validate(self, mutationDF):
)
total_error.write(errors)
warning.write(warnings)

for allele_col in self._allele_cols:
if process_functions.checkColExist(mutationDF, allele_col):
invalid_indices = validate.get_invalid_allele_rows(
mutationDF,
allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
ignore_case=True,
allow_na=False,
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices,
invalid_col=allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
fileformat=self._fileType,
)
total_error.write(errors)
warning.write(warnings)

return total_error.getvalue(), warning.getvalue()

def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
Expand Down
23 changes: 23 additions & 0 deletions genie_registry/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ class vcf(FileTypeFormat):
_fileType = "vcf"

_process_kwargs = []
_allele_cols = ["REF"]
_allowed_comb_alleles = ["A", "T", "C", "G", "N"]
_allowed_ind_alleles = []

def _validateFilename(self, filePath):
basename = os.path.basename(filePath[0])
Expand Down Expand Up @@ -137,6 +140,26 @@ def _validate(self, vcfdf):
total_error += error
warning += warn

for allele_col in self._allele_cols:
if process_functions.checkColExist(vcfdf, allele_col):
invalid_indices = validate.get_invalid_allele_rows(
vcfdf,
input_col=allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
ignore_case=True,
allow_na=False,
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices,
invalid_col=allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
fileformat=self._fileType,
)
total_error += errors
warning += warnings

# No white spaces
white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1)
if sum(white_space) > 0:
Expand Down
46 changes: 24 additions & 22 deletions tests/test_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@ def valid_maf_df():
dict(
CHROMOSOME=[1, 2, 3, 4, 5],
START_POSITION=[1, 2, 3, 4, 2],
REFERENCE_ALLELE=["A", "A", "A", "A", "A"],
REFERENCE_ALLELE=[
"C",
"G",
"NA",
"-",
"TAAAGATCGTACAGAA",
],
TUMOR_SAMPLE_BARCODE=[
"GENIE-SAGE-ID1-1",
"GENIE-SAGE-ID1-1",
Expand Down Expand Up @@ -94,6 +100,10 @@ def test_firstcolumn_validation(maf_class):
"maf: First column header must be "
"one of these: CHROMOSOME, HUGO_SYMBOL, "
"TUMOR_SAMPLE_BARCODE.\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
f"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
)
assert error == expectedErrors
assert warning == ""
Expand Down Expand Up @@ -147,16 +157,20 @@ def test_errors_validation(maf_class):
"This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, "
"10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
)
expectedWarnings = (
"maf: "
"Does not have the column headers that can give "
"extra information to the processed maf: "
"T_REF_COUNT, N_DEPTH.\n"
"maf: "
"REFERENCE_ALLELE column contains 'NA' values, "
"which cannot be placeholders for blank values. "
"Please put in empty strings for blank values.\n"
)

assert error == expectedErrors
Expand Down Expand Up @@ -195,11 +209,12 @@ def test_invalid_validation(maf_class):
"maf: "
"TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
)
expectedWarnings = (
"maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, "
"which cannot be placeholders for blank values. "
"Please put in empty strings for blank values.\n"
"maf: Does not have the column headers that can give "
"extra information to the processed maf: T_REF_COUNT.\n"
)
Expand All @@ -210,25 +225,12 @@ def test_invalid_validation(maf_class):
@pytest.mark.parametrize("col", ["temp", "REFERENCE_ALLELE"])
def test_noerror__check_allele_col(col):
    """Test error and warning are empty strings for a non-null allele column.

    Runs once with a column name that is absent from the DataFrame and once
    with the REFERENCE_ALLELE column present; the literal string "NA" is
    included among the values.
    """
    df = pd.DataFrame(dict(REFERENCE_ALLELE=["NA", "A"]))
    error, warning = genie_registry.maf._check_allele_col(df, col)
    assert error == ""
    assert warning == ""


def test_warning__check_allele_col():
    """Test that a literal 'NA' string in the column yields only a warning"""
    expected_warning = (
        "maf: "
        "TEMP column contains 'NA' values, "
        "which cannot be placeholders for blank values. "
        "Please put in empty strings for blank values.\n"
    )
    allele_df = pd.DataFrame({"TEMP": ["NA", "A"]})
    error, warning = genie_registry.maf._check_allele_col(allele_df, "TEMP")
    assert error == ""
    assert warning == expected_warning


def test_error__check_allele_col():
"""Test error occurs when blank allele is passed in"""
df = pd.DataFrame(dict(TEMP=[float("nan"), "A"]))
Expand Down
Loading

0 comments on commit c430754

Please sign in to comment.