From 4f062d0f1c25d5722b5a21e37487b473fc910e25 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Wed, 26 Jun 2024 15:14:00 +0300 Subject: [PATCH 1/3] Parsers: parsing green and golden access from larger set --- dags/open_access/constants.py | 7 +- dags/open_access/open_access.py | 11 +-- dags/open_access/parsers.py | 139 ++++++++++++++++++++++++++----- dags/open_access/utils.py | 17 ++-- tests/open_access/test_parser.py | 83 +++++++++++++++++- 5 files changed, 217 insertions(+), 40 deletions(-) diff --git a/dags/open_access/constants.py b/dags/open_access/constants.py index 294509c..1b68abe 100644 --- a/dags/open_access/constants.py +++ b/dags/open_access/constants.py @@ -2,11 +2,8 @@ r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+" + r"not+540__f:Bronze+not+540__3:preprint" ) BRONZE_ACCESS = r"540__f:'Bronze'" -GREEN_ACCESS = ( - r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+not+540__a:" - + r"'arXiv+nonexclusive-distrib'+not+540__f:'Bronze'" -) -GOLD_ACCESS = r"540__3:'publication'+and+" + r"(540__a:'CC-BY'+OR++540__a:'CC+BY')" +GREEN_ACCESS = r"" +GOLD_ACCESS = r"" CERN_READ_AND_PUBLISH = r"540__f:'CERN-RP" CERN_INDIVIDUAL_APCS = r"540__f:'CERN-APC'" diff --git a/dags/open_access/open_access.py b/dags/open_access/open_access.py index 9d221de..4619948 100644 --- a/dags/open_access/open_access.py +++ b/dags/open_access/open_access.py @@ -20,22 +20,19 @@ def oa_dag(): @task(executor_config=kubernetes_executor_config) def fetch_data_task(query, **kwargs): year = kwargs["params"].get("year") - cds_token = os.environ.get("CDS_TOKEN") - if not cds_token: - logging.warning("cds token is not set!") base_query = ( r"(affiliation:CERN+or+595:'For+annual+report')" + rf"and+year:{year}+not+980:ConferencePaper+" + r"not+980:BookChapter" ) type_of_query = [*query][0] - url = utils.get_url(query=f"{base_query}+{query[type_of_query]}") - data = request_again_if_failed(url=url, cds_token=cds_token) + url = utils.get_url(query=f"{base_query}") + data = request_again_if_failed(url=url) total = get_total_results_count(data.text) if type_of_query == "gold": - total = utils.get_gold_access_count(total, url) + total = utils.get_golden_access_count(total, url) if type_of_query == "green": - total = total - utils.get_gold_access_count(total, url) + total = utils.get_green_access_count(total, url) return {type_of_query: total} @task(multiple_outputs=True, executor_config=kubernetes_executor_config) diff --git a/dags/open_access/parsers.py b/dags/open_access/parsers.py index 55da0ca..65b7d33 100644 --- a/dags/open_access/parsers.py +++ b/dags/open_access/parsers.py @@ -1,3 +1,4 @@ +import re import xml.etree.ElementTree as ET from io import StringIO @@ -13,25 +14,123 @@ def parse_without_names_spaces(xml): return root -def get_golden_access_records_ids(data): - xml = parse_without_names_spaces(data) - records = xml.findall(".record") - golden_access = [] +def is_correct_value(value): + match value.text.lower(): + case "accepted manuscript": + return True + case "preprint": + return True + case _: + return False + + +def field_has_cc_by(field_value): + # is CC BY-SA 4.0 falls under the condition of "contains ‘CC-BY’ or ‘CC BY’?? + # + pattern = re.compile(r"CC(\s|-)?BY(\s|-)?4.0", flags=re.I) + return bool(pattern.match(field_value)) + + +def parse_subset_856(datafields_856): + at_least_one_found = False + for datafield in datafields_856: + subfield = datafield.find("subfield[@code='y']") + try: + is_subfield_y_wanted_value = is_correct_value(subfield) + if not at_least_one_found: + at_least_one_found = is_subfield_y_wanted_value + at_least_one_found = is_subfield_y_wanted_value + except AttributeError: + pass + return at_least_one_found + + +def parse_subset_540_preprint(datafields_540): + at_least_one_found = False + for datafield in datafields_540: + subfield_3 = datafield.find("subfield[@code='3']") + try: + is_subfield_3_wanted_value = subfield_3.text.lower() == "preprint" + if not at_least_one_found: + at_least_one_found = is_subfield_3_wanted_value + except AttributeError: + pass + return at_least_one_found + + +def parse_subset_540_publication(datafields_540): + at_least_one_found = False + for datafield in datafields_540: + subfield_3 = datafield.find("subfield[@code='3']") + subfield_a = datafield.find("subfield[@code='a']") + try: + is_subfield_wanted_3_value = subfield_3.text.lower() == "publication" + is_subfield_a_wanted_value = field_has_cc_by(subfield_a.text) + if not at_least_one_found: + at_least_one_found = bool( + is_subfield_wanted_3_value and is_subfield_a_wanted_value + ) + except AttributeError: + pass + return at_least_one_found + + +def parse_subset_green_access(records): + filtered_records = [] + for record in records: + datafields_856 = record.findall("datafield[@tag='856'][@ind1='4'][@ind2=' ']") + datafields_540 = record.findall("datafield/[@tag='540']") + if datafields_856 is None: + continue + if datafields_540 is None: + continue + is_it_wanted_record_by_856 = parse_subset_856(datafields_856) + is_it_wanted_record_by_540_preprint = parse_subset_540_preprint(datafields_540) + is_it_wanted_record_by_540_publication = not parse_subset_540_publication( + datafields_540 + ) + + if ( + is_it_wanted_record_by_856 + or is_it_wanted_record_by_540_preprint + or is_it_wanted_record_by_540_publication + ): + filtered_records.append(record) + + return filtered_records + + +def parse_subset_golden_access(records): + filtered_records = [] for record in records: - datafields = record.findall("datafield/[@tag='540']") - if datafields is None: + datafields_540 = record.findall("datafield/[@tag='540']") + if datafields_540 is None: continue - for datafield in datafields: - record_type = datafield.find("subfield/[@code='3']") - license = datafield.find("subfield/[@code='a']") - if record_type is not None and license is not None: - if ( - "CC" in license.text - and "BY" in license.text - and record_type.text == "publication" - ): - record_id = record.find("controlfield/[@tag='001']") - if record_id is not None: - doi = record_id.text - golden_access.append(doi) - return golden_access + is_it_wanted_record_by_540_publication = parse_subset_540_publication( + datafields_540 + ) + + if is_it_wanted_record_by_540_publication: + filtered_records.append(record) + return filtered_records + + +def get_records_ids(data, record_filter): + xml = parse_without_names_spaces(data) + records = xml.findall(".record") + filtered_records = record_filter(records) + green_access = [] + for record in filtered_records: + record_id = record.find("controlfield/[@tag='001']") + if record_id is not None: + doi = record_id.text + green_access.append(doi) + return green_access + + +def get_golden_access_records_ids(data): + return get_records_ids(data, parse_subset_golden_access) + + +def get_green_access_records_ids(data): + return get_records_ids(data, parse_subset_green_access) diff --git a/dags/open_access/utils.py b/dags/open_access/utils.py index e2d101f..b682beb 100644 --- a/dags/open_access/utils.py +++ b/dags/open_access/utils.py @@ -2,23 +2,30 @@ import math from common.utils import request_again_if_failed -from open_access.parsers import get_golden_access_records_ids +from open_access.parsers import (get_golden_access_records_ids, + get_green_access_records_ids) -def get_gold_access_count(total, url): +def get_count(total, url, record_extractor): iterations = math.ceil(total / 100.0) records_ids_count = 0 for i in range(0, iterations): jrec = (i * 100) + 1 full_url = f"{url}&jrec={jrec}" response = request_again_if_failed(full_url) - records_ids_count = records_ids_count + len( - get_golden_access_records_ids(response.text) - ) + records_ids_count = records_ids_count + len(record_extractor(response.text)) logging.info(f"In total was found {records_ids_count} golden access records") return records_ids_count +def get_golden_access_count(total, url): + return get_count(total, url, get_golden_access_records_ids) + + +def get_green_access_count(total, url): + return get_count(total, url, get_green_access_records_ids) + + def get_url(query, current_collection="Published+Articles"): url = ( rf"https://cds.cern.ch/search?ln=en&cc={current_collection}&p={query}" diff --git a/tests/open_access/test_parser.py b/tests/open_access/test_parser.py index 599a76d..d1ff1da 100644 --- a/tests/open_access/test_parser.py +++ b/tests/open_access/test_parser.py @@ -1,6 +1,10 @@ -from open_access.parsers import get_golden_access_records_ids +from open_access.parsers import (get_golden_access_records_ids, + get_green_access_records_ids, + parse_subset_540_preprint, + parse_subset_540_publication, parse_subset_856, + parse_without_names_spaces) -expected = [ +expected_golden = [ "2894668", "2891488", "2888511", @@ -11,6 +15,28 @@ "2882429", "2882335", "2882328", + "2882324", + "2882322", + "2882311", + "2882298", +] + +expected_green = [ + "2894668", + "2891489", + "2891488", + "2891487", + "2888511", + "2888151", + "2886038", + "2884472", + "2884471", + "2884470", + "2884469", + "2883672", + "2882429", + "2882335", + "2882328", "2882327", "2882324", "2882322", @@ -22,4 +48,55 @@ def test_get_golden_access_records_dois(shared_datadir): with open(shared_datadir / "search.xml") as file: records_ids = get_golden_access_records_ids(file.read()) - assert records_ids == expected + assert records_ids == expected_golden + + +def test_parse_subset_856(shared_datadir): + with open(shared_datadir / "search.xml") as file: + filtered_records_count = 0 + parsed_records = parse_without_names_spaces(file.read()) + records = parsed_records.findall(".record") + for record in records: + datafields_856 = record.findall( + "datafield[@tag='856'][@ind1='4'][@ind2=' ']" + ) + is_it_wanted_record_by_856 = parse_subset_856(datafields_856) + if is_it_wanted_record_by_856: + filtered_records_count = filtered_records_count + 1 + assert filtered_records_count == 0 + + +def test_parse_subset_540_preprint(shared_datadir): + with open(shared_datadir / "search.xml") as file: + filtered_records_count = 0 + parsed_records = parse_without_names_spaces(file.read()) + records = parsed_records.findall(".record") + for record in records: + datafields_540 = record.findall( + "datafield[@tag='540'][@ind1=' '][@ind2=' ']" + ) + is_it_wanted_record_by_540 = parse_subset_540_preprint(datafields_540) + if is_it_wanted_record_by_540: + filtered_records_count = filtered_records_count + 1 + assert filtered_records_count == 20 + + +def test_parse_subset_540_publications(shared_datadir): + with open(shared_datadir / "search.xml") as file: + filtered_records_count = 0 + parsed_records = parse_without_names_spaces(file.read()) + records = parsed_records.findall(".record") + for record in records: + datafields_540 = record.findall( + "datafield[@tag='540'][@ind1=' '][@ind2=' ']" + ) + is_it_wanted_record_by_540 = parse_subset_540_publication(datafields_540) + if is_it_wanted_record_by_540: + filtered_records_count = filtered_records_count + 1 + assert filtered_records_count == 14 + + +def test_get_green_access_records_dois(shared_datadir): + with open(shared_datadir / "search.xml") as file: + records_ids = get_green_access_records_ids(file.read()) + assert records_ids == expected_green From bb0fd6b36af8c5597978822ae1298f7f5b84c428 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Wed, 26 Jun 2024 15:37:58 +0300 Subject: [PATCH 2/3] Dockerfile: use our custom image --- Dockerfile | 2 +- requirements-test.txt | 8 ++++---- requirements.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index b51ef49..f03c7a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.8.2-python3.10 +FROM registry.cern.ch/cern-sis/airflow-base:2.8.3 ENV PYTHONBUFFERED=0 ENV AIRFLOW__LOGGING__LOGGING_LEVEL=INFO diff --git a/requirements-test.txt b/requirements-test.txt index c0cf4b9..96bd19a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ -pre-commit==3.6.2 -pytest==7.4.4 -coverage==7.4.3 -pytest-cov==4.1.0 +pre-commit +pytest +coverage +pytest-cov pytest-datadir==1.5.0 diff --git a/requirements.txt b/requirements.txt index 4e15ccb..1663ef4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.3/constraints-3.10.txt apache-airflow[celery, postgres, redis, cncf.kubernetes]==2.8.3 -alembic==1.13.1 +alembic airflow-provider-alembic==1.0.0 elementpath==4.4.0 From e14de78a120751bc67b6360e80587f8a052f0863 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Thu, 27 Jun 2024 10:22:29 +0300 Subject: [PATCH 3/3] Parsers: refactoring --- dags/open_access/parsers.py | 14 +++++++------- tests/open_access/test_parser.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dags/open_access/parsers.py b/dags/open_access/parsers.py index 65b7d33..978059a 100644 --- a/dags/open_access/parsers.py +++ b/dags/open_access/parsers.py @@ -31,7 +31,7 @@ def field_has_cc_by(field_value): return bool(pattern.match(field_value)) -def parse_subset_856(datafields_856): +def is_subset_856_for_green_access(datafields_856): at_least_one_found = False for datafield in datafields_856: subfield = datafield.find("subfield[@code='y']") @@ -45,7 +45,7 @@ def parse_subset_856(datafields_856): return at_least_one_found -def parse_subset_540_preprint(datafields_540): +def is_subset_540_preprint_green_access(datafields_540): at_least_one_found = False for datafield in datafields_540: subfield_3 = datafield.find("subfield[@code='3']") @@ -58,7 +58,7 @@ def parse_subset_540_preprint(datafields_540): return at_least_one_found -def parse_subset_540_publication(datafields_540): +def is_subset_540_publication_golden_access(datafields_540): at_least_one_found = False for datafield in datafields_540: subfield_3 = datafield.find("subfield[@code='3']") @@ -84,9 +84,9 @@ def parse_subset_green_access(records): continue if datafields_540 is None: continue - is_it_wanted_record_by_856 = parse_subset_856(datafields_856) - is_it_wanted_record_by_540_preprint = parse_subset_540_preprint(datafields_540) - is_it_wanted_record_by_540_publication = not parse_subset_540_publication( + is_it_wanted_record_by_856 = is_subset_856_for_green_access(datafields_856) + is_it_wanted_record_by_540_preprint = is_subset_540_preprint_green_access(datafields_540) + is_it_wanted_record_by_540_publication = not is_subset_540_publication_golden_access( datafields_540 ) @@ -106,7 +106,7 @@ def parse_subset_golden_access(records): datafields_540 = record.findall("datafield/[@tag='540']") if datafields_540 is None: continue - is_it_wanted_record_by_540_publication = parse_subset_540_publication( + is_it_wanted_record_by_540_publication = is_subset_540_publication_golden_access( datafields_540 ) diff --git a/tests/open_access/test_parser.py b/tests/open_access/test_parser.py index d1ff1da..8153ef3 100644 --- a/tests/open_access/test_parser.py +++ b/tests/open_access/test_parser.py @@ -1,7 +1,7 @@ from open_access.parsers import (get_golden_access_records_ids, get_green_access_records_ids, - parse_subset_540_preprint, - parse_subset_540_publication, parse_subset_856, + is_subset_540_preprint_green_access, + is_subset_540_publication_golden_access, is_subset_856_for_green_access, parse_without_names_spaces) expected_golden = [ @@ -60,7 +60,7 @@ def test_parse_subset_856(shared_datadir): datafields_856 = record.findall( "datafield[@tag='856'][@ind1='4'][@ind2=' ']" ) - is_it_wanted_record_by_856 = parse_subset_856(datafields_856) + is_it_wanted_record_by_856 = is_subset_856_for_green_access(datafields_856) if is_it_wanted_record_by_856: filtered_records_count = filtered_records_count + 1 assert filtered_records_count == 0 @@ -75,7 +75,7 @@ def test_parse_subset_540_preprint(shared_datadir): datafields_540 = record.findall( "datafield[@tag='540'][@ind1=' '][@ind2=' ']" ) - is_it_wanted_record_by_540 = parse_subset_540_preprint(datafields_540) + is_it_wanted_record_by_540 = is_subset_540_preprint_green_access(datafields_540) if is_it_wanted_record_by_540: filtered_records_count = filtered_records_count + 1 assert filtered_records_count == 20 @@ -90,7 +90,7 @@ def test_parse_subset_540_publications(shared_datadir): datafields_540 = record.findall( "datafield[@tag='540'][@ind1=' '][@ind2=' ']" ) - is_it_wanted_record_by_540 = parse_subset_540_publication(datafields_540) + is_it_wanted_record_by_540 = is_subset_540_publication_golden_access(datafields_540) if is_it_wanted_record_by_540: filtered_records_count = filtered_records_count + 1 assert filtered_records_count == 14