Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-1067] add functionality to warn for identical ref and tsa2 #553

Merged
merged 8 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
16 changes: 13 additions & 3 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
logger = logging.getLogger(__name__)


def _check_tsa1_tsa2(df):
def _check_allele_col_validity(df):
"""If maf file has both TSA1 and TSA2,
TSA1 must equal REF, or TSA1 must equal TSA2.
TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
"""
tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
Expand All @@ -29,6 +29,16 @@ def _check_tsa1_tsa2(df):
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)
if (
tsa2_col_exist
and ref_col_exist
and not df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").empty
):
error = (
f"{error}maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n"
)
row_index = df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2").index.values
return error


Expand Down Expand Up @@ -260,7 +270,7 @@ def _validate(self, mutationDF):
# "start with 'chr' or any 'WT' values.\n"
# )

error = _check_tsa1_tsa2(mutationDF)
error = _check_allele_col_validity(mutationDF)
total_error.write(error)

if process_functions.checkColExist(mutationDF, "TUMOR_SAMPLE_BARCODE"):
Expand Down
152 changes: 115 additions & 37 deletions tests/test_maf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from cmath import nan
from unittest.mock import mock_open, patch

import pandas as pd
Expand Down Expand Up @@ -81,7 +82,7 @@ def test_firstcolumn_validation(maf_class):
"N_DEPTH": [1, 2, 3, 4, 3],
"N_REF_COUNT": [1, 2, 3, 4, 3],
"N_ALT_COUNT": [1, 2, 3, 4, 3],
"TUMOR_SEQ_ALLELE2": ["A", "A", "A", "A", "A"],
"TUMOR_SEQ_ALLELE2": ["T", "A", "A", "A", "A"],
}
)
order = [
Expand Down Expand Up @@ -196,7 +197,7 @@ def test_invalid_validation(maf_class):
)

with patch.object(
genie_registry.maf, "_check_tsa1_tsa2", return_value=""
genie_registry.maf, "_check_allele_col_validity", return_value=""
) as check_tsa1_tsa2:
error, warning = maf_class._validate(mafDf)
check_tsa1_tsa2.assert_called_once_with(mafDf)
Expand Down Expand Up @@ -241,47 +242,124 @@ def test_error__check_allele_col():
assert warning == ""


def test_invalid__check_tsa1_tsa2():
"""Test the scenario in which maf file has TSA1 and TSA2 and fails"""
df = pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["B", "B", "B"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
)
error = genie_registry.maf._check_tsa1_tsa2(df)
assert error == (
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)


@pytest.mark.parametrize(
"df",
"test_df,expected_error",
[
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
),
"",
),
pd.DataFrame(
dict(
REFERENCE_ALLELE=["C", "C", "C"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["A", "A", "A"],
)
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["C", "C", "C"],
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
TUMOR_SEQ_ALLELE2=["A", "A", "A"],
)
),
"",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["B", "B", "B"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
),
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["A", "A", "A"],
TUMOR_SEQ_ALLELE2=["A", "C", "C"],
)
),
"maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE2=["A", "C", "C"],
)
),
"maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE2=["C", "C", "C"],
)
),
"",
),
(
pd.DataFrame(
dict(
TUMOR_SEQ_ALLELE1=["C", "C", "C"],
)
),
"",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=["A", "A", "A"],
TUMOR_SEQ_ALLELE1=["B", "B", "B"],
TUMOR_SEQ_ALLELE2=["A", "C", "C"],
)
),
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
"maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
"This is invalid. Please correct.\n",
),
(
pd.DataFrame(
dict(
REFERENCE_ALLELE=[nan, "A", "A"],
TUMOR_SEQ_ALLELE1=["B", nan, "B"],
TUMOR_SEQ_ALLELE2=[nan, "C", "C"],
)
),
"maf: Contains both "
"TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n",
),
],
ids=[
"matching_tsa1_tsa2",
"matching_tsa1_ref",
"invalid_tsa1",
"identical_ref_tsa2",
"identical_ref_tsa2_missing_tsa1",
"valid_ref_tsa2_missing_tsa1",
"missing_tsa2_ref",
"invalid_tsa1_identical_ref_tsa2",
"NAs_in_allele_cole",
],
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
)
def test_valid__check_tsa1_tsa2(df):
"""Test valid TSA1 and TSA2"""
error = genie_registry.maf._check_tsa1_tsa2(df)
assert error == ""
def test__check_allele_col_validity(test_df, expected_error):
error = genie_registry.maf._check_allele_col_validity(test_df)
assert error == expected_error


def test_that__cross_validate_does_not_read_files_if_no_clinical_files(maf_class):
Expand Down
Loading