From ba4eded875c76b047a8349f75b3b0227e7fed7ad Mon Sep 17 00:00:00 2001
From: Anushya Muruganujan
Date: Fri, 10 Nov 2023 13:30:55 -0800
Subject: [PATCH] For #2175 validate go id

---
Review note: the namespace/identity guard originally compared the bound
method `str.isnumeric` with `False`, which is never true, so non-numeric
identities such as "GO:abc" were accepted. The guard now calls the method.
Line breaks and indentation were restored from a whitespace-collapsed copy
of this patch, and the post-image blob ids on the `index` lines predate the
fix; apply with `--ignore-whitespace` if a context line fails to match.

 ontobio/io/gafparser.py   |  9 +++++++-
 ontobio/io/gpadparser.py  | 10 ++++++++-
 tests/test_gafparser.py   |  6 ++++++
 tests/test_gpad_parser.py | 44 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/ontobio/io/gafparser.py b/ontobio/io/gafparser.py
index d2be5037..8064a67c 100644
--- a/ontobio/io/gafparser.py
+++ b/ontobio/io/gafparser.py
@@ -435,8 +435,15 @@ def to_association(gaf_line: List[str], report=None, group="unknown", dataset="u
     qualifiers = [association.Curie.from_str(curie_util.contract_uri(relations.lookup_label(q), strict=False)[0]) for q in qualifiers]
 
     object = association.Term(association.Curie.from_str(gaf_line[4]), taxon)
-    if isinstance(object, association.Error):
+    if isinstance(object, association.Error) or isinstance(object.id, association.Error):
         report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Problem parsing GO Term", taxon=gaf_line[TAXON_INDEX], rule=1)
+        return assocparser.ParseResult(source_line, [], True, report=report)
+
+    # Check GO Term namespace and identifier
+    go_term = object.id
+    if go_term.namespace != "GO" or not go_term.identity.isnumeric():
+        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Namespace should be \"GO\" and identity a numeric value greater than \"0\"", taxon=gaf_line[TAXON_INDEX], rule=1)
+        return assocparser.ParseResult(source_line, [], True, report=report)
 
     # References
     references = [association.Curie.from_str(e) for e in gaf_line[5].split("|") if e]
diff --git a/ontobio/io/gpadparser.py b/ontobio/io/gpadparser.py
index 2b9881b6..f222fa66 100644
--- a/ontobio/io/gpadparser.py
+++ b/ontobio/io/gpadparser.py
@@ -305,6 +305,10 @@ def from_1_2(gpad_line: List[str], report=None, group="unknown", dataset="unknow
     if go_term.is_error():
         report.error(source_line, Report.INVALID_SYMBOL, gpad_line[3], "Problem parsing GO Term", taxon=str(taxon), rule=1)
         return assocparser.ParseResult(source_line, [], True, report=report)
+
+    if go_term.namespace != "GO" or not go_term.identity.isnumeric():
+        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[3], "Namespace should be \"GO\" and identity a numeric value greater than \"0\"", taxon=str(taxon), rule=1)
+        return assocparser.ParseResult(source_line, [], True, report=report)
 
     object = association.Term(go_term, taxon)
 
@@ -449,7 +453,11 @@ def from_2_0(gpad_line: List[str], report=None, group="unknown", dataset="unknow
     if go_term.is_error():
         report.error(source_line, Report.INVALID_SYMBOL, gpad_line[ONTOLOGY_CLASS_INDEX], "Problem parsing GO Term", taxon=str(taxon), rule=1)
         return assocparser.ParseResult(source_line, [], True, report=report)
-
+
+    if go_term.namespace != "GO" or not go_term.identity.isnumeric():
+        report.error(source_line, Report.INVALID_SYMBOL, gpad_line[ONTOLOGY_CLASS_INDEX], "Namespace should be \"GO\" and identity a numeric value greater than \"0\"", taxon=str(taxon), rule=1)
+        return assocparser.ParseResult(source_line, [], True, report=report)
+
     object = association.Term(go_term, taxon)
 
     evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
diff --git a/tests/test_gafparser.py b/tests/test_gafparser.py
index f41e3671..ed19cb32 100644
--- a/tests/test_gafparser.py
+++ b/tests/test_gafparser.py
@@ -420,6 +420,12 @@ def test_bad_date():
     assoc_result = p.parse_line("PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\tTODAY\tPomBase\tfoo(X:1)")
     assert assoc_result.skipped == True
     assert assoc_result.associations == []
+
+def test_bad_go_id():
+    p = GafParser()
+    assoc_result = p.parse_line("PomBase\tSPAC25B8.17\typf1\t\tINVALID:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20231110\tPomBase\tfoo(X:1)")
+    assert assoc_result.skipped == True
+    assert assoc_result.associations == []
 
 def test_bad_taxon():
     p = GafParser()
diff --git a/tests/test_gpad_parser.py b/tests/test_gpad_parser.py
index 2bba3651..2628795a 100644
--- a/tests/test_gpad_parser.py
+++ b/tests/test_gpad_parser.py
@@ -124,6 +124,27 @@ def test_parse_interacting_taxon():
     ]
     result = to_association(list(vals), report=report, version="1.2")
     assert result.associations[0].interacting_taxon == Curie(namespace="NCBITaxon", identity="5678")
+
+def test_parse_go_id_1_2():
+    report = assocparser.Report(group="unknown", dataset="unknown")
+    vals = [
+        "MGI",
+        "MGI:1918911",
+        "enables",
+        "UBERON:1234",
+        "MGI:MGI:2156816|GO_REF:0000015",
+        "ECO:0000307",
+        "",
+        "",
+        "20100209",
+        "MGI",
+        "",
+        "creation-date=2020-09-17|modification-date=2020-09-17|contributor-id=http://orcid.org/0000-0003-2689-5511"
+    ]
+    result = to_association(list(vals), report=report, version="1.2")
+    assert result.skipped == 1
+    assert len([m for m in result.report.messages if m["level"] == "ERROR"]) == 1
+    assert len(result.associations) == 0
 
 
 def test_duplicate_key_annot_properties():
@@ -189,6 +210,29 @@ def test_parse_2_0():
     # Test annotation property retrieval
     contributors = result.associations[0].annotation_property_values(property_key="contributor-id")
     assert set(contributors) == {"http://orcid.org/0000-0003-2689-5511"}
+
+
+def test_parse_go_id_2_0():
+    version = "2.0"
+    report = assocparser.Report(group="unknown", dataset="unknown")
+    vals = [
+        "MGI:MGI:1918911",
+        "",
+        "RO:0002327",
+        "UBERON:5678",
+        "MGI:MGI:2156816|GO_REF:0000015",
+        "ECO:0000307",
+        "",
+        "",
+        "2020-09-17",
+        "MGI",
+        "",
+        "creation-date=2020-09-17|modification-date=2020-09-17|contributor-id=http://orcid.org/0000-0003-2689-5511"
+    ]
+    result = to_association(list(vals), report=report, version=version)
+    assert result.skipped == 1
+    assert len([m for m in result.report.messages if m["level"] == "ERROR"]) == 1
+    assert len(result.associations) == 0
 
 
 def test_aspect_fill_for_obsolete_terms():