Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GEN-1067] add functionality to warn for identical ref and tsa2 #553

Merged
merged 8 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

def _check_tsa1_tsa2(df):
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
"""If maf file has both TSA1 and TSA2,
TSA1 must equal REF, or TSA1 must equal TSA2.
TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
"""
tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
Expand All @@ -29,6 +29,9 @@ def _check_tsa1_tsa2(df):
"All values in TUMOR_SEQ_ALLELE1 must match all values in "
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)
if tsa2_col_exist and ref_col_exist and not df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').empty:
error =(f"{error}REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. "
f"Please check row: {', '.join(str(e+1) for e in df.query('REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2').index.values)}.\n")
danlu1 marked this conversation as resolved.
Show resolved Hide resolved
return error


Expand Down
35 changes: 34 additions & 1 deletion tests/test_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_firstcolumn_validation(maf_class):
"N_DEPTH": [1, 2, 3, 4, 3],
"N_REF_COUNT": [1, 2, 3, 4, 3],
"N_ALT_COUNT": [1, 2, 3, 4, 3],
"TUMOR_SEQ_ALLELE2": ["A", "A", "A", "A", "A"],
"TUMOR_SEQ_ALLELE2": ["T", "A", "A", "A", "A"],
}
)
order = [
Expand Down Expand Up @@ -258,6 +258,39 @@ def test_invalid__check_tsa1_tsa2():
"REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
)

def test_invalid__check_ref_tsa2():
    """Check that identical REFERENCE_ALLELE and TUMOR_SEQ_ALLELE2 values fail.

    Only the first row has REFERENCE_ALLELE equal to TUMOR_SEQ_ALLELE2, and
    the expected message names that row as row 1 (row numbers are 1-based).
    TUMOR_SEQ_ALLELE1 matches REFERENCE_ALLELE in every row, so no other
    message is expected.
    """
    maf_df = pd.DataFrame(
        {
            "REFERENCE_ALLELE": ["A", "A", "A"],
            "TUMOR_SEQ_ALLELE1": ["A", "A", "A"],
            "TUMOR_SEQ_ALLELE2": ["A", "C", "C"],
        }
    )
    expected_error = (
        "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. "
        "Please check row: 1.\n"
    )
    assert genie_registry.maf._check_tsa1_tsa2(maf_df) == expected_error

def test_invalid__check_ref_tsa1_tsa2():
    """Check that both allele-consistency failures are reported together.

    TUMOR_SEQ_ALLELE1 matches neither REFERENCE_ALLELE nor TUMOR_SEQ_ALLELE2,
    and the first row has REFERENCE_ALLELE equal to TUMOR_SEQ_ALLELE2. The
    expected text is the TSA1 mismatch message followed by the REF/TSA2
    message naming row 1.
    """
    maf_df = pd.DataFrame(
        {
            "REFERENCE_ALLELE": ["A", "A", "A"],
            "TUMOR_SEQ_ALLELE1": ["B", "B", "B"],
            "TUMOR_SEQ_ALLELE2": ["A", "C", "C"],
        }
    )
    tsa1_mismatch_msg = (
        "maf: Contains both "
        "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
        "All values in TUMOR_SEQ_ALLELE1 must match all values in "
        "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
    )
    ref_equals_tsa2_msg = (
        "REFERENCE_ALLELE should not equal to TUMOR_SEQ_ALLELE2. "
        "Please check row: 1.\n"
    )
    result = genie_registry.maf._check_tsa1_tsa2(maf_df)
    assert result == tsa1_mismatch_msg + ref_equals_tsa2_msg

@pytest.mark.parametrize(
"df",
Expand Down
Loading