def annotate_eml(eml_path: str, workbook_path: str, output_path: str) -> None:
    """Annotate an EML file with terms from the corresponding workbook

    :param eml_path: The path to the EML file to be annotated.
    :param workbook_path: The path to the workbook corresponding to the EML
        file. It is read as a tab separated, UTF-8 encoded table and must
        provide the columns ``element``, ``element_xpath``, ``predicate``,
        ``predicate_id``, ``object`` and ``object_id``.
    :param output_path: The path to write the annotated EML file.
    :returns: None
    :raises ValueError: If a workbook row targets a dataset or attribute
        element that cannot be located in the EML file.

    :notes: The EML file is annotated with terms from the corresponding
        workbook. Terms from the workbook are added even if they are already
        present in the EML file. Rows missing any of the predicate/object
        labels or identifiers are skipped. Rows whose ``element`` is neither
        ``dataset`` nor ``attribute`` are ignored.
    """
    # Load the EML and workbook for processing. Blank text is dropped on
    # parsing so that pretty printing re-indents the inserted elements.
    eml = etree.parse(eml_path, parser=etree.XMLParser(remove_blank_text=True))
    wb = pd.read_csv(workbook_path, sep="\t", encoding="utf-8")
    root = eml.getroot()
    required = ("predicate", "predicate_id", "object", "object_id")

    # Iterate over workbook rows and annotate the EML
    for _, row in wb.iterrows():

        # Only annotate if required components are present
        if any(pd.isnull(row[column]) for column in required):
            continue

        annotation = create_annotation_element(
            predicate_label=row["predicate"],
            predicate_id=row["predicate_id"],
            object_label=row["object"],
            object_id=row["object_id"],
        )

        if row["element"] == "dataset":
            _insert_dataset_annotation(root, annotation)
        elif row["element"] == "attribute":
            _append_attribute_annotation(root, row["element_xpath"], annotation)

    # Write eml to file
    eml.write(output_path, pretty_print=True, encoding="utf-8", xml_declaration=True)


def _insert_dataset_annotation(root, annotation) -> None:
    """Insert a dataset level annotation immediately before ``contact``

    :param root: The root element of the parsed EML document.
    :param annotation: The annotation element to insert.
    :returns: None
    :raises ValueError: If the EML has no ``dataset`` element, or the dataset
        has no ``contact`` child to anchor the insertion.
    """
    dataset = root.find(".//dataset")
    if dataset is None:
        raise ValueError("Cannot annotate dataset: no 'dataset' element in EML.")

    # The contact element is used as a consistent reference point for
    # locating dataset level annotations. Each new annotation lands directly
    # before it, so annotations keep the order of the workbook rows.
    # NOTE(review): if optional elements (e.g. purpose, maintenance) precede
    # contact, the schema may expect annotations before those - confirm.
    contact = dataset.find("contact")
    if contact is None:
        raise ValueError("Cannot annotate dataset: no 'contact' element in dataset.")
    dataset.insert(dataset.index(contact), annotation)


def _append_attribute_annotation(root, element_xpath, annotation) -> None:
    """Append an annotation as the last child of an attribute element

    :param root: The root element of the parsed EML document.
    :param element_xpath: The absolute XPath of the attribute, as listed in
        the workbook.
    :param annotation: The annotation element to append.
    :returns: None
    :raises ValueError: If ``element_xpath`` is not a string (e.g. an empty
        workbook cell) or does not resolve to an element.
    """
    if not isinstance(element_xpath, str):
        raise ValueError(
            f"Cannot annotate attribute: invalid element_xpath {element_xpath!r}."
        )

    # Convert absolute XPath to relative path to avoid errors
    attribute_xpath = element_xpath.replace("/eml:eml", "./")
    attribute = root.find(attribute_xpath)
    if attribute is None:
        raise ValueError(
            f"Cannot annotate attribute: no element found at {element_xpath!r}."
        )
    attribute.append(annotation)


def create_annotation_element(
    predicate_label: str, predicate_id: str, object_label: str, object_id: str
):
    """Create an EML annotation element

    :param predicate_label: The predicate label of the annotation.
    :param predicate_id: The URI of the predicate.
    :param object_label: The object label of the annotation.
    :param object_id: The URI of the object.
    :returns: An ``annotation`` element with a ``propertyURI`` child and a
        ``valueURI`` child. Each child carries its label in the ``label``
        attribute and its URI as text.
    """
    annotation_elem = etree.Element("annotation")

    property_uri_elem = etree.SubElement(annotation_elem, "propertyURI")
    property_uri_elem.attrib["label"] = predicate_label
    property_uri_elem.text = predicate_id

    value_uri_elem = etree.SubElement(annotation_elem, "valueURI")
    value_uri_elem.attrib["label"] = object_label
    value_uri_elem.text = object_id

    return annotation_elem
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ sbcbon@msi.ucsb.edu +
+ + + Robert J + Miller + + UCSB +
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ 805-893-6174 + miller@msi.ucsb.edu + https://orcid.org/0000-0002-8350-3759 +
+ + + Andrew R + Rassweiler + + UCSB +
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ 805-893-7823 + andrew.rassweiler@lifesci.ucsb.edu + https://orcid.org/0000-0002-8760-3888 +
+ + + Jenn + Caselle + + UCSB +
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ 805-893-5144 + caselle@ucsb.edu +
+ + + David + Kushner + + NPS +
+ National Park Service + Ventura + CA + US +
+ 805-658-5773 + david_kushner@nps.gov +
+ + + Daniel C + Reed + + UCSB +
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ 805-893-8363 + dan.reed@lifesci.ucsb.edu + https://orcid.org/0000-0003-3015-8717 +
+ + + Kevin D + Lafferty + + USGS +
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ Klafferty@usgs.gov + https://orcid.org/0000-0001-7583-4593 +
+ + + Li + Kui + + UCSB +
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ lkui@ucsb.edu + https://orcid.org/0000-0002-5894-4907 +
+ + + Margaret + O'Brien + + UCSB +
+ Marine Science Institute University of California + Santa Barbara + CA + 93106-6150 + US +
+ margaret.obrien@ucsb.edu + https://orcid.org/0000-0002-1693-8322 +
+ 2021-03-02 + English + + + The Santa Barbara Channel Marine Biodiversity Observation Network + (SBCMBON) tracks long-term patterns in species abundance and + diversity. This dataset contains cover of kelp forest sessile + invertebrates, understory macroalgae, and substrate types by + integrating data from four contributing projects working in the kelp + forests of the Santa Barbara Channel, USA. Divers collect data on + using either uniform point contact (UPC) or random point contact (RPC) + methods. + + + The four contributing projects are two research projects: The Santa + Barbara Coastal LTER (SBC LTER) and the Partnership for + Interdisciplinary Studies of Coastal Oceans (PISCO), the kelp forest + monitoring program of the Santa Barbara Channel National Park, and the + San Nicolas Island monitoring program supported by USGS. Together, + these projects have recorded data for more than 200 species at + approximately 100 sites on both the mainland coast and on the Santa + Barbara Channel Islands. Sampling began in 1982 and is ongoing. Data + were collected by human observation (divers using SCUBA) during + regular surveys. + + + Percent cover is recorded for taxa where individuals cannot be + counted. Cover can be calculated from the data here as the fraction of + total points at which the taxon was present x 100. With UPC and RPC + methods, multiple species can be recorded at any given point. The + total percent cover of all species combined using this method can + exceed 100%; however, the percent cover of any single species cannot + exceed 100%. See Methods for information on integration and data + processing. + + + MBON is funded by National Aeronautics and Space Administration + (NASA), Bureau of Ocean Energy Management (BOEM), and National Oceanic + and Atmospheric Administration (NOAA). 
+ + + For users who are interested in using all or part of this integrated + datasets, please contact data owners to discuss your research + interests, data-related issues or any other questions. A recommended + citation for the data package is available from the download page. In + addition, any manuscript generated using this dataset is expected to + be sent to the data owners before publication so we can be sure the + data is used in the proper context and methods are reported + accurately: + + + Santa Barbara Coastal LTER (LTER): + + + Dan Reed dan.reed@lifesci.ucsb.edu + + + Robert Miller miller@msi.ucsb.edu + + + Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO): + + + Jenn Caselle caselle@ucsb.edu + + + Kelp forest monitoring (KFM): + + + David Kushner david_kushner@nps.gov + + + Joshua Sprague joshua_sprague@nps.gov + + + San Nicolas Island monitoring (SNI): + + + Kevin Lafferty Klafferty@usgs.gov + + + Mike Kenner mkenner@ucsc.edu + + + + Population Abundance + Essential Biodiversity Variables + + + BasisofRecord: HumanObservation + Occurrence: OrganismQuantity + Taxon: ScientificName + Darwin Core Terms + + + algae + invertebrate + random point contact + Santa Barbara Channel Marine BON + uniform point contact + none + + + + This data package is released under the Creative Commons License + Attribution 4.0 International (CC BY 4.0, see + https://creativecommons.org/licenses/by/4.0/). This license states + that consumers ("Data Users" herein) may distribute, adapt, + reuse, remix, and build upon this work, as long as they give + appropriate credit, provide a link to the license, and indicate if + changes were made. If redistributed, a Data User may not apply + additional restrictions or technological measures that prevent access. + + + The Data User has an ethical obligation to cite the data source + appropriately in any publication or product that results from its use, + and notify the data contact or creator. 
Communication, collaboration, + or co-authorship (as appropriate) with the creators of this data + package is encouraged to prevent duplicate research or publication. + The Data User is urged to contact the authors of these data if any + questions about methodology or results occur. The Data User should + realize that these data may be actively used by others for ongoing + research and that coordination may be necessary to prevent duplication + or inappropriate use. The Data User should realize that + misinterpretation may occur if data are used outside of the context of + the original study. The Data User should be aware that data are + updated periodically and it is the responsibility of the Data User to + check for new versions of the data. + + + While substantial efforts are made to ensure the accuracy of data and + associated documentation, complete accuracy of data sets cannot be + guaranteed. This data package (with its components) is made available + “as is” and with no warranty of accuracy or fitness for use. The + creators of this data package and the repository where these data were + obtained shall not be liable for any damages resulting from + misinterpretation, use or misuse of the data package or its + components. 
+ + + + + http://sbc.marinebon.org/ + + + + + California, USA: Nearshore reefs of the Santa Barbara Channel and Channel Islands, California, USA + + -120.65022 + -118.4 + 34.87315 + 32.8 + + + + + + 1980-08-25 + + + 2020-07-31 + + + + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_03000117 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_01000058 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_01000335 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_00000562 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_00000098 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_01000687 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/PATO_0000467 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_00000015 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/ENVO_2000015 + + + http://purl.obolibrary.org/obo/IAO_0000136 + http://purl.obolibrary.org/obo/PATO_0000057 + + + SCB MBON + Information Manager, Southern California Bight Marine Biodiversity Observation Network +
+ Marine Science Institute + University of California + Santa Barbara + California + 93106-6150 + United States +
+ sbcbon@msi.ucsb.edu + http://sbc.marinebon.org +
+ + Environmental Data Initiative + info@environmentaldatainitiative.org + https://environmentaldatainitiative.org + + Environmental Data Initiative + + + + + The integrated dataset combines four monitoring projects in the kelp + forests of the Santa Barbara Channel, USA. The surveying methods, raw + data, and meta data files for each project can be found online. + + + Santa Barbara Coastal LTER (LTER): + https://sbclter.msi.ucsb.edu/data/catalog/ + + + Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO): + https://data.piscoweb.org/metacatui/view/doi:10.6085/AA/PISCO_kelpforest.1.6 + + + Kelp forest monitoring (KFM): + http://www.esapubs.org/archive/ecol/E094/245/ + + + San Nicolas Island monitoring (SNI): + http://www.esapubs.org/archive/ecol/E094/244/ + + + The data integration process includes data cleaning, column + standardization, species validation, and data merging. The data + cleaning was processed in seven steps: 1. Select the sites located + within the Santa Barbara Channel; 2. Separate the biological survey + data into three categories: fish survey, quad and swath count, and + benthic cover; 3. Assign survey area to the corresponding record for + each survey, and calculate the total point count for the benthic cover + data; 4. Add life stage (adult and juvenile) information to the survey + record; 5. Remove redundant rows; 6. Convert date format into ISO + compliant format (YYYY-MM-DD). + + + After the cleaning process, the designated columns (for the integrated + dataset) were selected and standardized indices were generated for + site, subsite, transect, replicate plot, and taxon. Species’ + scientific name were run through an online data base + http://www.marinespecies.org/ to obtain an authoritative name and + taxon information. Finally, the standardized-format table from the + four projects were merged into one csv file for each survey category. 
+ + + For researchers who want to use the integrated datasets, there are + some potential issues and cautions due to the difference among + sampling methods of these four programs. Please refer to the full + protocol (PDF) listed below for additional details. + + + + Integrated data post processing for data analysis + + + Kui + + + + + http://sbc.marinebon.org/Metadata/Protocols/MBON_post_data_processing_20181130.pdf + + + + + + This method step describes provenance-based metadata.This provenance metadata does not contain entity specific information. + + PISCO Kelp Forest Community Surveys + + + Mark + Carr + + mhcarr@ucsc.edu + https://orcid.org/0000-0001-9644-7680 + + + + This online link references an EML document that describes data used in the creation of this derivative data package. + https://data.piscoweb.org/metacatui/view/doi:10.6085/AA/PISCO_kelpforest.1.6 + + + + + Mark + Carr + + mhcarr@ucsc.edu + + + + + + Southern California Bight Marine Biodiversity Observation Network + + + Dr. + Robert J + Miller + +
+ Marine Science Institute + University of California + Santa Barbara + California + 93106-6150 + United States +
+ 805 893 6174 + miller@msi.ucsb.edu + Principal Investigator +
+ + + Dr. + Daniel + Reed + +
+ Marine Science Institute + University of California + Santa Barbara + California + 93106-6150 + United States +
+ 805 893 8363 + reed@lifesci.ucsb.edu + Co-Principal Investigator +
+ + + Dr. + David + Siegel + +
+ Institute for Computational Earth System Science + University of California + Santa Barbara + California + 93106-3060 + United States +
+ 805 893 4547 + davey@icess.ucsb.edu + Co-Principal Investigator +
+ + + Dr. + Craig + Carlson + +
+ Department of Ecology, Evolution and Marine Biology + University of California + Santa Barbara + California + 93106-9620 + United States +
+ 805 893 2541 + craig.carlson@lifesci.ucsb.edu + Co-Principal Investigator +
+ + + Dr. + Kevin D + Lafferty + +
+ US Geological Survey Western Ecological Research Center + University of California + Santa Barbara + California + 93106 + United States +
+ Klafferty@usgs.gov + Co-Principal Investigator +
+ + + Dr. + B.S. + Manjunath + +
+ Department of Electrical and Computer Engineering + University of California + Santa Barbara + California + 93106-9560 + United States +
+ 805 893 7112 + manj@ece.ucsb.edu + Co-Principal Investigator +
+ + + Dr. + Andrew + Rassweiler + +
+ Department of Biological Science + Florida State University + Tallahassee + Florida + 32306-4295 + United States +
+ 850 644 1555 + rassweiler@bio.fsu.edu + Co-Principal Investigator +
+ +
+ The Southern California Bight Marine Biodiversity Observation Network (SCB MBON) is + designed to provide a complete picture of marine biodiversity in the region. SCB MBON is + developing a widely applicable research model that integrates new information with + existing data to improve current research and monitoring programs and provide greater + insight into marine biodiversity. +
+
+ +
+ MBON is funded by National Aeronautics and Space Administration (NASA), Bureau of + Ocean Energy Management (BOEM), and National Oceanic and Atmospheric Administration + (NOAA). +
+
+
+ + SBCMBON kelp forest integrated benthic cover biological survey + Cover from four projects using UPC and RPC methods + + SBCMBON_kelp_forest_integrated_benthic_cover_20210120.csv + 409555780 + a1d0f0250c2570b89e0fffbbea13337e + + + 1 + \r\n + column + + , + no + " + + + + + + https://pasta.lternet.edu/package/data/eml/edi/3/9/07339606e346313dba6608b9300d1717 + + + + + + data_source + Data source + Source project for this data + string + + + + + + kfm + Kelp forest monitoring program of the Santa Barbara Channel National Park + + + lter + Santa Barbara Coastal LTER + + + pisco + Partnership for Interdisciplinary Studies of Coastal Oceans + + + sni + San Nicolas Island monitoring program + + + + + + + + sample_method + Sampling method + Sampling method + string + + + + + any text + + + + + + + date + Date + Date of survey + date + + + YYYY-MM-DD + 1 + + + + + http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl#containsMeasurementsOfType + http://purl.dataone.org/odo/ECSO_00002051 + + + + site_id + Site ID + ID of a site, assigned by each project + string + + + + + any text + + + + + + + subsite_id + Subsite ID + Identifier for the subsite,one level below site + string + + + + + any text + + + + + + + transect_id + Transect ID + Identifier for the transect + string + + + + + any text + + + + + + + replicate_id + Replicate ID + Identifier for the replicate + string + + + + + any text + + + + + + + proj_taxon_id + Project-taxon code + Code assigned by SBC MBON for this taxon from this data source (project) + string + + + + + [a-z]-[a-z]-[0-9][0-9][0-9] + + + + + + http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl#containsMeasurementsOfType + http://purl.dataone.org/odo/ECSO_00002744 + + + + points + Points + Number of total points counted on a UPC or RPC survey + float + + + + number + + 1 + + real + + + + + + count + Count + Number of organisms counted + float + + + + number + + 1 + + integer + + + + + . 
+ value not recorded or not available + + + + auth_taxon_id + Authoritative Taxon Code + Taxon code assigned by an authoritative source + string + + + + + any text + + + + + + . + value not recorded or not available + + + http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl#containsMeasurementsOfType + http://purl.dataone.org/odo/ECSO_00002608 + + + + auth_name + Authoritative Taxon Code Source + Name of the athority or registry assigning the Authoritative Taxon Code + string + + + + + any text + + + + + + . + value not recorded or not available + + + http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl#containsMeasurementsOfType + http://purl.dataone.org/odo/ECSO_00001193 + + + http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl#containsMeasurementsOfType + http://purl.dataone.org/odo/ECSO_00002608 + + + + taxon_name + Taxon name + Taxon name, usually species binomial or other taxon name + string + + + + + any text + + + + + + http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl#containsMeasurementsOfType + http://purl.dataone.org/odo/ECSO_00001193 + + + http://ecoinformatics.org/oboe/oboe.1.2/oboe-core.owl#containsMeasurementsOfType + http://purl.dataone.org/odo/ECSO_00000313 + + + + site_name + Site name + The site, as named by each project + string + + + + + any text + + + + + + + subsite_name + Subsite name + Survey region within a site + string + + + + + any text + + + + + + . + value not recorded or not available + + + + latitude + Latitude + Site latitude + float + + + + degree + + 1e-04 + + real + + + + + . + value not recorded or not available + + + + longitude + Longitude + Site longitude + float + + + + degree + + 1e-04 + + real + + + + + . + value not recorded or not available + + + + 2826412 + +
+ + + + + 360 degrees comprise a unit circle. + + + a number + + + + +
def test_annotate_eml(tmp_path):
    """Test annotate_eml"""
    eml_file = os.path.join(get_example_eml_dir(), "edi.3.9.xml")
    wb_file = "tests/edi.3.9_annotation_workbook_annotated.tsv"
    output_file = str(tmp_path / "edi.3.9_annotated.xml")

    # Check that there are no annotations in the EML file
    eml = etree.parse(eml_file)
    assert eml.xpath(".//annotation") == []

    # Annotate the EML file
    annotate_eml(eml_path=eml_file, workbook_path=wb_file, output_path=output_file)

    # Check that the EML file was annotated
    assert os.path.exists(output_file)
    eml_annotated = etree.parse(output_file)
    annotations = eml_annotated.xpath(".//annotation")
    assert annotations != []

    # The number of annotation elements should be equal to the number of rows
    # in the workbook where predicates and objects are both present.
    wb = pd.read_csv(wb_file, sep="\t", encoding="utf-8")
    num_rows = len(
        wb.dropna(subset=["predicate", "predicate_id", "object", "object_id"])
    )
    assert len(annotations) == num_rows


# pylint: disable=line-too-long
def test_create_annotation_element():
    """Test create_annotation_element"""
    # Expected serialization: labels are attributes, URIs are element text.
    fixture = """<annotation><propertyURI label="predicate_label">predicate_id</propertyURI><valueURI label="object_label">object_id</valueURI></annotation>"""
    annotation_element = create_annotation_element(
        predicate_label="predicate_label",
        predicate_id="predicate_id",
        object_label="object_label",
        object_id="object_id",
    )
    assert etree.tostring(annotation_element).decode("utf-8") == fixture