# Allele validation helpers (the additions this change makes to genie/validate.py).


def get_invalid_allele_rows(
    input_data: pd.DataFrame,
    input_col: str,
    allowed_comb_alleles: list,
    allowed_ind_alleles: list,
    ignore_case: bool = False,
    allow_na: bool = False,
) -> pd.Index:
    """
    Find invalid indices in a DataFrame column based on allowed allele values.

    A value is valid when it is either a sequence of one or more
    ``allowed_comb_alleles`` or exactly one of the ``allowed_ind_alleles``.
    Non-string values are compared through their ``str()`` form. When both
    lists are empty no pattern is applied, so only missing values can be
    flagged.

    Args:
        input_data (pd.DataFrame): The DataFrame to search.
        input_col (str): The name of the column to check.
        allowed_comb_alleles (list): The list of allowed allele values
            (can appear in combinations or individually)
        allowed_ind_alleles (list): The list of allowed allele values
            (can only appear individually)
        ignore_case (bool, optional): whether to perform case-insensitive matching
        allow_na (bool, optional): whether to allow NAs to be an allowed allele
            value or not.

    Returns:
        pd.Index: A pandas index object indicating the row indices that
            don't match the allowed alleles
    """
    # Build one alternative per rule instead of concatenating "^[..]+$" strings:
    # concatenation produced a pattern starting with "|" when only
    # individual-only alleles were given, and that empty alternative matched
    # every value.
    alternatives = []
    comb_tokens = [
        re.escape(str(allele))
        for allele in (allowed_comb_alleles or [])
        if str(allele)
    ]
    if comb_tokens:
        alternatives.append(f"(?:{'|'.join(comb_tokens)})+")
    # Individual-only alleles get no repetition operator, so "--" or a mix of
    # two individual-only values is rejected.
    alternatives.extend(
        re.escape(str(allele))
        for allele in (allowed_ind_alleles or [])
        if str(allele)
    )

    pattern = None
    if alternatives:
        # \Z anchors at the true end of the string ("$" would also accept a
        # trailing newline); match() already anchors at the start.
        pattern = re.compile(
            f"(?:{'|'.join(alternatives)})\\Z",
            re.IGNORECASE if ignore_case else 0,
        )

    def _is_invalid(value) -> bool:
        # Missing values are decided solely by allow_na, whatever the column
        # dtype is, so all-NA columns need no special casing.
        if pd.isna(value):
            return not allow_na
        return pattern is not None and pattern.match(str(value)) is None

    invalid_mask = pd.Series(
        [_is_invalid(value) for value in input_data[input_col]], dtype=bool
    ).to_numpy()
    return input_data.index[invalid_mask]


def get_allele_validation_message(
    invalid_indices: pd.Index,
    invalid_col: str,
    allowed_comb_alleles: list,
    allowed_ind_alleles: list,
    fileformat: str,
) -> tuple:
    """Creates the error/warning message for the check for invalid alleles

    Args:
        invalid_indices (pd.Index): the row indices that have invalid
            alleles; any sized collection works, only its length is used
        invalid_col (str): The column with the invalid values
        allowed_comb_alleles (list): The list of allowed allele values
            (can appear in combinations or individually)
        allowed_ind_alleles (list): The list of allowed allele values
            (can only appear individually)
        fileformat (str): Name of the fileformat

    Returns:
        tuple: The errors and warnings from the allele validation.
            Defaults to blank strings; this check never produces a warning.
    """
    errors = ""
    warnings = ""
    if len(invalid_indices) > 0:
        errors = (
            f"{fileformat}: Your {invalid_col} column has invalid allele values. "
            "This is the list of accepted allele values that can appear individually "
            f"or in combination with each other: {','.join(allowed_comb_alleles)}.\n"
            "This is the list of accepted allele values that can only appear individually: "
            f"{','.join(allowed_ind_alleles)}\n"
        )
    return errors, warnings
warning.getvalue() def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple: diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py index 71ad86a4..cf381086 100644 --- a/genie_registry/vcf.py +++ b/genie_registry/vcf.py @@ -18,6 +18,9 @@ class vcf(FileTypeFormat): _fileType = "vcf" _process_kwargs = [] + _allele_cols = ["REF"] + _allowed_comb_alleles = ["A", "T", "C", "G", "N"] + _allowed_ind_alleles = [] def _validateFilename(self, filePath): basename = os.path.basename(filePath[0]) @@ -137,6 +140,26 @@ def _validate(self, vcfdf): total_error += error warning += warn + for allele_col in self._allele_cols: + if process_functions.checkColExist(vcfdf, allele_col): + invalid_indices = validate.get_invalid_allele_rows( + vcfdf, + input_col=allele_col, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, + ignore_case=True, + allow_na=False, + ) + errors, warnings = validate.get_allele_validation_message( + invalid_indices, + invalid_col=allele_col, + allowed_comb_alleles=self._allowed_comb_alleles, + allowed_ind_alleles=self._allowed_ind_alleles, + fileformat=self._fileType, + ) + total_error += errors + warning += warnings + # No white spaces white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1) if sum(white_space) > 0: diff --git a/tests/test_maf.py b/tests/test_maf.py index ef07d54d..71d61e64 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -19,7 +19,13 @@ def valid_maf_df(): dict( CHROMOSOME=[1, 2, 3, 4, 5], START_POSITION=[1, 2, 3, 4, 2], - REFERENCE_ALLELE=["A", "A", "A", "A", "A"], + REFERENCE_ALLELE=[ + "C", + "G", + "NA", + "-", + "TAAAGATCGTACAGAA", + ], TUMOR_SAMPLE_BARCODE=[ "GENIE-SAGE-ID1-1", "GENIE-SAGE-ID1-1", @@ -94,6 +100,10 @@ def test_firstcolumn_validation(maf_class): "maf: First column header must be " "one of these: CHROMOSOME, HUGO_SYMBOL, " "TUMOR_SAMPLE_BARCODE.\n" + "maf: Your REFERENCE_ALLELE column has invalid allele values. 
" + "This is the list of accepted allele values that can appear individually " + f"or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" ) assert error == expectedErrors assert warning == "" @@ -147,16 +157,20 @@ def test_errors_validation(maf_class): "This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, " "10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n" "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" + "maf: Your REFERENCE_ALLELE column has invalid allele values. " + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" + "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. " + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" ) expectedWarnings = ( "maf: " "Does not have the column headers that can give " "extra information to the processed maf: " "T_REF_COUNT, N_DEPTH.\n" - "maf: " - "REFERENCE_ALLELE column contains 'NA' values, " - "which cannot be placeholders for blank values. " - "Please put in empty strings for blank values.\n" ) assert error == expectedErrors @@ -195,11 +209,12 @@ def test_invalid_validation(maf_class): "maf: " "TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n" "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" + "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. 
" + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: -\n" ) expectedWarnings = ( - "maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, " - "which cannot be placeholders for blank values. " - "Please put in empty strings for blank values.\n" "maf: Does not have the column headers that can give " "extra information to the processed maf: T_REF_COUNT.\n" ) @@ -210,25 +225,12 @@ def test_invalid_validation(maf_class): @pytest.mark.parametrize("col", ["temp", "REFERENCE_ALLELE"]) def test_noerror__check_allele_col(col): """Test error and warning is an empty string if REF col isn't passed in""" - df = pd.DataFrame(dict(REFERENCE_ALLELE=["A", "A"])) + df = pd.DataFrame(dict(REFERENCE_ALLELE=["NA", "A"])) error, warning = genie_registry.maf._check_allele_col(df, col) assert error == "" assert warning == "" -def test_warning__check_allele_col(): - """Test warning occurs when 'NA' string is passed in""" - df = pd.DataFrame(dict(TEMP=["NA", "A"])) - error, warning = genie_registry.maf._check_allele_col(df, "TEMP") - assert error == "" - assert warning == ( - "maf: " - "TEMP column contains 'NA' values, " - "which cannot be placeholders for blank values. 
" - "Please put in empty strings for blank values.\n" - ) - - def test_error__check_allele_col(): """Test error occurs when blank allele is passed in""" df = pd.DataFrame(dict(TEMP=[float("nan"), "A"])) diff --git a/tests/test_validate.py b/tests/test_validate.py index b94a4ac3..4e16cfee 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -768,3 +768,215 @@ def test_that_standardize_string_for_validation_returns_expected( allow_underscore=allow_underscore, ) assert test_str == expected + + +def get_invalid_allele_rows_test_cases(): + return [ + { + "name": "correct_alleles", + "input": pd.DataFrame( + { + "REFERENCE_ALLELE": [ + "NANANANA", + "ACGTN", + "A", + "C", + "T", + "G", + "-", + "N", + ] + } + ), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G", "N"], + "allowed_ind_alleles": ["-"], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "incorrect_alleles", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX", "XXX"]}), + "expected_index": pd.Index([0, 1, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "case_ignored", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["acgtg", "acgt", "-", "a"]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": ["-"], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "case_not_ignored", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}), + "expected_index": pd.Index([0]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": False, + "allow_na": True, + }, + { + "name": "no_ind_alleles_incorrect", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACG-T", "ACGT", "G-CT"]}), + "expected_index": pd.Index([0, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": 
"no_ind_alleles_correct", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACT", "ACGT", "G"]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "missing_entries_not_allowed", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}), + "expected_index": pd.Index([1, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, + { + "name": "missing_entries_allowed", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "no_specified_alleles_values", + "input": pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", "ACGF", "B"]}), + "expected_index": pd.Index([]), + "allowed_comb_alleles": [], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "float_nas_not_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]} + ), + "expected_index": pd.Index([0, 1, 2, 3, 4]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, + { + "name": "float_nas_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]} + ), + "expected_index": pd.Index([0, 1, 3, 4]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "all_missing_nas_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]} + ), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": 
"all_missing_nas_not_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]} + ), + "expected_index": pd.Index([0, 1, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, + ] + + +@pytest.mark.parametrize( + "test_cases", get_invalid_allele_rows_test_cases(), ids=lambda x: x["name"] +) +def test_that_get_invalid_allele_rows_returns_expected(test_cases): + invalid_rows = validate.get_invalid_allele_rows( + test_cases["input"], + input_col="REFERENCE_ALLELE", + allowed_comb_alleles=test_cases["allowed_comb_alleles"], + allowed_ind_alleles=test_cases["allowed_ind_alleles"], + ignore_case=test_cases["ignore_case"], + allow_na=test_cases["allow_na"], + ) + assert invalid_rows.equals(test_cases["expected_index"]) + + +def get_allele_validation_message_test_cases(): + return [ + { + "name": "has_invalid_alleles", + "input_invalid_rows": pd.Index([1, 2, 3]), + "allowed_comb_alleles": ["A", "C", "T", "G"], + "allowed_ind_alleles": ["-"], + "expected_error": ( + "maf: Your REFERENCE_ALLELE column has invalid allele values. " + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,C,T,G.\n" + "This is the list of accepted allele values that can only appear individually: -\n" + ), + "expected_warning": "", + }, + { + "name": "has_no_invalid_alleles", + "input_invalid_rows": [], + "allowed_comb_alleles": [], + "allowed_ind_alleles": [], + "expected_error": "", + "expected_warning": "", + }, + { + "name": "has_invalid_alleles_empty_ind_alleles", + "input_invalid_rows": pd.Index([1, 2, 3]), + "allowed_comb_alleles": ["A", "C", "T", "G"], + "allowed_ind_alleles": [], + "expected_error": ( + "maf: Your REFERENCE_ALLELE column has invalid allele values. 
" + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,C,T,G.\n" + "This is the list of accepted allele values that can only appear individually: \n" + ), + "expected_warning": "", + }, + ] + + +@pytest.mark.parametrize( + "test_cases", get_allele_validation_message_test_cases(), ids=lambda x: x["name"] +) +def test_that_get_allele_validation_message_returns_expected(test_cases): + error, warning = validate.get_allele_validation_message( + invalid_indices=test_cases["input_invalid_rows"], + invalid_col="REFERENCE_ALLELE", + allowed_comb_alleles=test_cases["allowed_comb_alleles"], + allowed_ind_alleles=test_cases["allowed_ind_alleles"], + fileformat="maf", + ) + assert error == test_cases["expected_error"] + assert warning == test_cases["expected_warning"] diff --git a/tests/test_vcf.py b/tests/test_vcf.py index 1c78d87f..d8bbabf5 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -29,7 +29,7 @@ def test_validation_valid_no_samples(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -47,7 +47,7 @@ def test_validation_valid_one_sample_tumor(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -67,7 +67,7 @@ def test_validation_valid_one_sample(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], 
@@ -88,7 +88,7 @@ def test_validation_missing_format_col(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -107,7 +107,7 @@ def test_validation_invalid_one_sample(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -130,7 +130,7 @@ def test_validation_valid_two_samples(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -151,7 +151,7 @@ def test_validation_invalid_two_samples_tumor(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -172,7 +172,7 @@ def test_validation_invalid_two_samples_normal(vcf_class): "#CHROM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], "ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AAED1", "AAAS"], @@ -193,7 +193,7 @@ def test_validation_invalid_white_space(vcf_class): "#CHROMM": ["2", "9", "12"], "POS": [69688533, 99401860, 53701241], "ID": ["AAK1", "AAED1", "AAAS"], - "REF": ["AAK1", "AAED1", "AAAS"], + "REF": ["AANT", "AACG", "AAAN"], 
"ALT": ["AAK1", "AAED1", "AAAS"], "QUAL": ["AAK1", "AAED1", "AAAS"], "FILTER": ["AAK1", "AA ED1", "AAAS"], @@ -231,6 +231,10 @@ def test_validation_invalid_content(vcf_class): "space delimited instead of tab delimited.\n" "vcf: Please double check your #CHROM column. This column must only be these values: " "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n" + "vcf: Your REF column has invalid allele values. " + "This is the list of accepted allele values that can appear individually " + "or in combination with each other: A,T,C,G,N.\n" + "This is the list of accepted allele values that can only appear individually: \n" ) expectedWarning = "vcf: Should not have the chr prefix in front of chromosomes.\n" assert error == expectedError