Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-809] Validate allele columns #539

Merged
merged 9 commits into from
Nov 9, 2023
94 changes: 94 additions & 0 deletions genie/validate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import re
import logging
from typing import Dict, List, Optional

Expand Down Expand Up @@ -415,3 +416,96 @@ def standardize_string_for_validation(
return standardized_str
else:
return input_string


def get_invalid_allele_rows(
    input_data: pd.DataFrame,
    input_col: str,
    allowed_comb_alleles: list,
    allowed_ind_alleles: list,
    ignore_case: bool = False,
    allow_na: bool = False,
) -> pd.Index:
    """
    Find invalid indices in a DataFrame column based on allowed allele values.

    A value is considered valid when it is either
    (a) made up entirely of one or more of the ``allowed_comb_alleles``, or
    (b) exactly one of the ``allowed_ind_alleles`` (no repeats, no mixing).
    If both lists are empty, no non-NA value is considered valid.

    Args:
        input_data (pd.DataFrame): The DataFrame to search.
        input_col (str): The name of the column to check.
        allowed_comb_alleles (list): The list of allowed allele values
            (can appear in combinations or individually)
        allowed_ind_alleles (list): The list of allowed allele values
            (can only appear individually)
        ignore_case (bool, optional): whether to perform case-insensitive matching
        allow_na (bool, optional): whether to allow NAs to be an allowed allele
            value or not.
    Returns:
        pd.Index: A pandas index object indicating the row indices that
        don't match the allowed alleles
    """
    # Escape every allele on its own and combine them with alternation so that
    # regex metacharacters (e.g. "-") and multi-character alleles are handled
    # correctly. Empty strings are dropped: they would otherwise introduce an
    # empty alternative that matches everything.
    alternatives = []
    comb_alternation = "|".join(
        re.escape(allele) for allele in allowed_comb_alleles if allele
    )
    if comb_alternation:
        # one or more of the combinable alleles
        alternatives.append(f"(?:{comb_alternation})+")

    ind_alternation = "|".join(
        re.escape(allele) for allele in allowed_ind_alleles if allele
    )
    if ind_alternation:
        # exactly one of the individual-only alleles (no "+" quantifier)
        alternatives.append(f"(?:{ind_alternation})")

    if alternatives:
        pattern_body = "|".join(alternatives)
        # \Z (rather than $) so that a trailing newline is not silently accepted
        search_str = rf"^(?:{pattern_body})\Z"
    else:
        # nothing is allowed: use a pattern that can never match
        search_str = r"(?!)"

    flags = re.IGNORECASE if ignore_case else 0

    # Special handling for an all-NA column: the .str accessor cannot be used
    # on it, so the answer is decided directly from allow_na.
    # NOTE(review): this per-function special-casing of all-NA / non-str
    # columns was flagged during review as something that should eventually be
    # handled in one standardized place; a follow-up ticket was requested.
    if pd.isna(input_data[input_col]).all():
        if allow_na:
            return pd.Index([])
        return input_data.index

    # Convert numeric cols to string while preserving NAs in order to use
    # str.match. Only the checked column is copied, so the caller's DataFrame
    # is left untouched without duplicating every other column.
    transformed_data = input_data[[input_col]].copy()
    transformed_data[input_col] = transform._convert_col_with_nas_to_str(
        transformed_data, input_col
    )

    # NA rows take the value of allow_na, i.e. they count as matching only
    # when NAs are allowed.
    matching_rows = transformed_data[input_col].str.match(
        search_str, flags=flags, na=allow_na
    )
    return transformed_data[~matching_rows].index


def get_allele_validation_message(
    invalid_indices: pd.Series,
    invalid_col: str,
    allowed_comb_alleles: list,
    allowed_ind_alleles: list,
    fileformat: str,
) -> tuple:
    """Build the error/warning text reported when a column has invalid alleles

    Args:
        invalid_indices (pd.Series): the row indices that
            have invalid alleles
        invalid_col (str): The column with the invalid values
        allowed_comb_alleles (list): The list of allowed allele values
            (can appear in combinations or individually)
        allowed_ind_alleles (list): The list of allowed allele values
            (can only appear individually)
        fileformat (str): Name of the fileformat

    Returns:
        tuple: The errors and warnings from the allele validation.
            Both are blank strings when there are no invalid indices;
            the warnings string is always blank.
    """
    no_warnings = ""

    # Nothing was flagged, so there is nothing to report.
    if len(invalid_indices) == 0:
        return "", no_warnings

    comb_listing = ",".join(allowed_comb_alleles)
    ind_listing = ",".join(allowed_ind_alleles)
    message_parts = [
        f"{fileformat}: Your {invalid_col} column has invalid allele values. ",
        "This is the list of accepted allele values that can appear individually ",
        f"or in combination with each other: {comb_listing}.\n",
        "This is the list of accepted allele values that can only appear individually: ",
        f"{ind_listing}\n",
    ]
    return "".join(message_parts), no_warnings
32 changes: 24 additions & 8 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,6 @@ def _check_allele_col(df, col):
error = ""
warning = ""
if col_exist:
# CHECK: The value "NA" can't be used as a placeholder
if sum(df[col].fillna("") == "NA") > 0:
warning = (
"maf: "
f"{col} column contains 'NA' values, "
"which cannot be placeholders for blank values. "
"Please put in empty strings for blank values.\n"
)
# CHECK: There can't be any null values
if sum(df[col].isnull()) > 0:
error = f"maf: {col} can't have any blank or null values.\n"
Expand All @@ -70,6 +62,9 @@ class maf(FileTypeFormat):
_fileType = "maf"

_process_kwargs = []
_allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
_allowed_comb_alleles = ["A", "T", "C", "G", "N"]
_allowed_ind_alleles = ["-"]

def _validateFilename(self, filePath):
"""
Expand Down Expand Up @@ -294,6 +289,27 @@ def _validate(self, mutationDF):
)
total_error.write(errors)
warning.write(warnings)

for allele_col in self._allele_cols:
if process_functions.checkColExist(mutationDF, allele_col):
invalid_indices = validate.get_invalid_allele_rows(
mutationDF,
allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
ignore_case=True,
allow_na=False,
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices,
invalid_col=allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
fileformat=self._fileType,
)
total_error.write(errors)
warning.write(warnings)

return total_error.getvalue(), warning.getvalue()

def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
Expand Down
23 changes: 23 additions & 0 deletions genie_registry/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ class vcf(FileTypeFormat):
_fileType = "vcf"

_process_kwargs = []
_allele_cols = ["REF"]
_allowed_comb_alleles = ["A", "T", "C", "G", "N"]
_allowed_ind_alleles = []

def _validateFilename(self, filePath):
basename = os.path.basename(filePath[0])
Expand Down Expand Up @@ -137,6 +140,26 @@ def _validate(self, vcfdf):
total_error += error
warning += warn

for allele_col in self._allele_cols:
if process_functions.checkColExist(vcfdf, allele_col):
invalid_indices = validate.get_invalid_allele_rows(
vcfdf,
input_col=allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
ignore_case=True,
allow_na=False,
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices,
invalid_col=allele_col,
allowed_comb_alleles=self._allowed_comb_alleles,
allowed_ind_alleles=self._allowed_ind_alleles,
fileformat=self._fileType,
)
total_error += errors
warning += warnings

# No white spaces
white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1)
if sum(white_space) > 0:
Expand Down
46 changes: 24 additions & 22 deletions tests/test_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@ def valid_maf_df():
dict(
CHROMOSOME=[1, 2, 3, 4, 5],
START_POSITION=[1, 2, 3, 4, 2],
REFERENCE_ALLELE=["A", "A", "A", "A", "A"],
REFERENCE_ALLELE=[
"C",
"G",
"NA",
"-",
"TAAAGATCGTACAGAA",
],
TUMOR_SAMPLE_BARCODE=[
"GENIE-SAGE-ID1-1",
"GENIE-SAGE-ID1-1",
Expand Down Expand Up @@ -94,6 +100,10 @@ def test_firstcolumn_validation(maf_class):
"maf: First column header must be "
"one of these: CHROMOSOME, HUGO_SYMBOL, "
"TUMOR_SAMPLE_BARCODE.\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
f"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
)
assert error == expectedErrors
assert warning == ""
Expand Down Expand Up @@ -147,16 +157,20 @@ def test_errors_validation(maf_class):
"This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, "
"10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
)
expectedWarnings = (
"maf: "
"Does not have the column headers that can give "
"extra information to the processed maf: "
"T_REF_COUNT, N_DEPTH.\n"
"maf: "
"REFERENCE_ALLELE column contains 'NA' values, "
"which cannot be placeholders for blank values. "
"Please put in empty strings for blank values.\n"
)

assert error == expectedErrors
Expand Down Expand Up @@ -195,11 +209,12 @@ def test_invalid_validation(maf_class):
"maf: "
"TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"This is the list of accepted allele values that can appear individually "
"or in combination with each other: A,T,C,G,N.\n"
"This is the list of accepted allele values that can only appear individually: -\n"
)
expectedWarnings = (
"maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, "
"which cannot be placeholders for blank values. "
"Please put in empty strings for blank values.\n"
"maf: Does not have the column headers that can give "
"extra information to the processed maf: T_REF_COUNT.\n"
)
Expand All @@ -210,25 +225,12 @@ def test_invalid_validation(maf_class):
@pytest.mark.parametrize("col", ["temp", "REFERENCE_ALLELE"])
def test_noerror__check_allele_col(col):
"""Test error and warning is an empty string if REF col isn't passed in"""
df = pd.DataFrame(dict(REFERENCE_ALLELE=["A", "A"]))
df = pd.DataFrame(dict(REFERENCE_ALLELE=["NA", "A"]))
error, warning = genie_registry.maf._check_allele_col(df, col)
assert error == ""
assert warning == ""


def test_warning__check_allele_col():
"""Test warning occurs when 'NA' string is passed in"""
df = pd.DataFrame(dict(TEMP=["NA", "A"]))
error, warning = genie_registry.maf._check_allele_col(df, "TEMP")
assert error == ""
assert warning == (
"maf: "
"TEMP column contains 'NA' values, "
"which cannot be placeholders for blank values. "
"Please put in empty strings for blank values.\n"
)


def test_error__check_allele_col():
"""Test error occurs when blank allele is passed in"""
df = pd.DataFrame(dict(TEMP=[float("nan"), "A"]))
Expand Down
Loading
Loading