Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Parsers: parsing green and golden access from larger set #24

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM apache/airflow:2.8.2-python3.10
FROM registry.cern.ch/cern-sis/airflow-base:2.8.3

ENV PYTHONBUFFERED=0
ENV AIRFLOW__LOGGING__LOGGING_LEVEL=INFO
Expand Down
7 changes: 2 additions & 5 deletions dags/open_access/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@
r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+" + r"not+540__f:Bronze+not+540__3:preprint"
)
BRONZE_ACCESS = r"540__f:'Bronze'"
GREEN_ACCESS = (
r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+not+540__a:"
+ r"'arXiv+nonexclusive-distrib'+not+540__f:'Bronze'"
)
GOLD_ACCESS = r"540__3:'publication'+and+" + r"(540__a:'CC-BY'+OR++540__a:'CC+BY')"
GREEN_ACCESS = r""
GOLD_ACCESS = r""

# CDS query for records funded via the CERN read-and-publish agreement.
# Fix: the closing quote around the 540__f value was missing.
CERN_READ_AND_PUBLISH = r"540__f:'CERN-RP'"
CERN_INDIVIDUAL_APCS = r"540__f:'CERN-APC'"
Expand Down
11 changes: 4 additions & 7 deletions dags/open_access/open_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,19 @@ def oa_dag():
@task(executor_config=kubernetes_executor_config)
def fetch_data_task(query, **kwargs):
year = kwargs["params"].get("year")
cds_token = os.environ.get("CDS_TOKEN")
if not cds_token:
logging.warning("cds token is not set!")
base_query = (
r"(affiliation:CERN+or+595:'For+annual+report')"
+ rf"and+year:{year}+not+980:ConferencePaper+"
+ r"not+980:BookChapter"
)
type_of_query = [*query][0]
url = utils.get_url(query=f"{base_query}+{query[type_of_query]}")
data = request_again_if_failed(url=url, cds_token=cds_token)
url = utils.get_url(query=f"{base_query}")
data = request_again_if_failed(url=url)
total = get_total_results_count(data.text)
if type_of_query == "gold":
total = utils.get_gold_access_count(total, url)
total = utils.get_golden_access_count(total, url)
if type_of_query == "green":
total = total - utils.get_gold_access_count(total, url)
total = utils.get_green_access_count(total, url)
return {type_of_query: total}

@task(multiple_outputs=True, executor_config=kubernetes_executor_config)
Expand Down
139 changes: 119 additions & 20 deletions dags/open_access/parsers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import xml.etree.ElementTree as ET
from io import StringIO

Expand All @@ -13,25 +14,123 @@ def parse_without_names_spaces(xml):
return root


def get_golden_access_records_ids(data):
xml = parse_without_names_spaces(data)
records = xml.findall(".record")
golden_access = []
def is_correct_value(value):
match value.text.lower():
case "accepted manuscript":
return True
case "preprint":
return True
case _:
return False


def field_has_cc_by(field_value):
    """Return True when *field_value* starts with a CC BY 4.0 licence label.

    Accepts separator variants such as "CC BY 4.0", "CC-BY-4.0" or
    "CCBY4.0" (case-insensitive).  Does NOT match derivative licences like
    "CC BY-SA 4.0", since "4.0" must follow "BY" directly.
    Fix: the dot in "4.0" was unescaped, so e.g. "CC BY 4x0" also matched.
    """
    pattern = re.compile(r"CC(\s|-)?BY(\s|-)?4\.0", flags=re.I)
    return bool(pattern.match(field_value))


def is_subset_856_for_green_access(datafields_856):
    """Return True when at least one 856 datafield's subfield "y" holds an
    eligible version label (see is_correct_value).

    Fix: the original re-assigned the flag unconditionally after the
    "if not at_least_one_found" guard, so it effectively returned the verdict
    of the LAST parsable subfield instead of "at least one matched".
    """
    for datafield in datafields_856:
        subfield = datafield.find("subfield[@code='y']")
        try:
            if is_correct_value(subfield):
                return True
        except AttributeError:
            # subfield is absent (None) or has no text — skip this datafield
            pass
    return False


def is_subset_540_preprint_green_access(datafields_540):
    """Return True when any 540 datafield has subfield "3" equal to
    "preprint" (case-insensitive)."""
    for field in datafields_540:
        material = field.find("subfield[@code='3']")
        if material is None or material.text is None:
            # missing subfield or empty text — nothing to compare
            continue
        if material.text.lower() == "preprint":
            return True
    return False


def is_subset_540_publication_golden_access(datafields_540):
    """Return True when any 540 datafield marks a publication with a
    CC BY 4.0 licence (subfield "3" == "publication" and subfield "a"
    matching field_has_cc_by)."""
    for field in datafields_540:
        material = field.find("subfield[@code='3']")
        licence = field.find("subfield[@code='a']")
        try:
            is_publication = material.text.lower() == "publication"
            has_cc_by = field_has_cc_by(licence.text)
        except AttributeError:
            # either subfield is absent or has no text — skip this datafield
            continue
        if is_publication and has_cc_by:
            return True
    return False


def parse_subset_green_access(records):
    """Filter *records* down to those considered green open access.

    A record qualifies when any of the following holds:
    - an 856 4_ datafield carries an eligible version label, or
    - a 540 datafield marks it as a preprint, or
    - it is NOT a CC BY publication (i.e. not gold open access).

    Fix: removed the dead "is None" guards — Element.findall always returns
    a (possibly empty) list, never None, so those branches were unreachable.
    """
    filtered_records = []
    for record in records:
        datafields_856 = record.findall("datafield[@tag='856'][@ind1='4'][@ind2=' ']")
        datafields_540 = record.findall("datafield/[@tag='540']")
        eligible_by_856 = is_subset_856_for_green_access(datafields_856)
        preprint_by_540 = is_subset_540_preprint_green_access(datafields_540)
        # NOTE(review): "not gold" makes every non-gold record green here —
        # presumably intentional for this WIP subset; confirm with the author.
        not_golden_by_540 = not is_subset_540_publication_golden_access(datafields_540)

        if eligible_by_856 or preprint_by_540 or not_golden_by_540:
            filtered_records.append(record)

    return filtered_records


def parse_subset_golden_access(records):
    """Filter *records* down to those considered gold open access:
    records whose 540 datafields mark a publication under a CC BY licence.

    Fix: removed leftover interleaved lines from the previous implementation
    and the dead "is None" guard — Element.findall always returns a list.
    """
    filtered_records = []
    for record in records:
        datafields_540 = record.findall("datafield/[@tag='540']")
        if is_subset_540_publication_golden_access(datafields_540):
            filtered_records.append(record)
    return filtered_records


def get_records_ids(data, record_filter):
    """Parse MARCXML *data*, keep records accepted by *record_filter*,
    and return their record ids (controlfield 001).

    Fix: renamed misleading locals — this function is generic (used for both
    green and gold), and controlfield 001 is the CDS record id, not a DOI.
    """
    xml = parse_without_names_spaces(data)
    records = xml.findall(".record")
    record_ids = []
    for record in record_filter(records):
        controlfield_001 = record.find("controlfield/[@tag='001']")
        if controlfield_001 is not None:
            record_ids.append(controlfield_001.text)
    return record_ids


def get_golden_access_records_ids(data):
    """Return the 001 record ids of gold open-access records in *data*."""
    record_filter = parse_subset_golden_access
    return get_records_ids(data, record_filter)


def get_green_access_records_ids(data):
    """Return the 001 record ids of green open-access records in *data*."""
    record_filter = parse_subset_green_access
    return get_records_ids(data, record_filter)
17 changes: 12 additions & 5 deletions dags/open_access/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,30 @@
import math

from common.utils import request_again_if_failed
from open_access.parsers import get_golden_access_records_ids
from open_access.parsers import (get_golden_access_records_ids,
get_green_access_records_ids)


def get_gold_access_count(total, url):
def get_count(total, url, record_extractor):
    """Page through CDS search results (100 records per page) and count the
    record ids extracted by *record_extractor* from each page.

    :param total: total number of matching records reported by the search.
    :param url: base search URL; "&jrec=<offset>" is appended per page.
    :param record_extractor: callable taking the page's XML text and
        returning the list of matching record ids.
    :return: total number of extracted record ids across all pages.

    Fix: the log message hardcoded "golden access records" although the
    function is generic; also switched to lazy %-style logging args.
    """
    iterations = math.ceil(total / 100.0)
    records_ids_count = 0
    for page in range(iterations):
        jrec = page * 100 + 1  # CDS record offsets are 1-based
        full_url = f"{url}&jrec={jrec}"
        response = request_again_if_failed(full_url)
        records_ids_count += len(record_extractor(response.text))
    logging.info("In total %s records were found", records_ids_count)
    return records_ids_count


def get_golden_access_count(total, url):
    """Count gold open-access records across all result pages."""
    return get_count(total, url, record_extractor=get_golden_access_records_ids)


def get_green_access_count(total, url):
    """Count green open-access records across all result pages."""
    return get_count(total, url, record_extractor=get_green_access_records_ids)


def get_url(query, current_collection="Published+Articles"):
url = (
rf"https://cds.cern.ch/search?ln=en&cc={current_collection}&p={query}"
Expand Down
8 changes: 4 additions & 4 deletions requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pre-commit==3.6.2
pytest==7.4.4
coverage==7.4.3
pytest-cov==4.1.0
pre-commit
pytest
coverage
pytest-cov
pytest-datadir==1.5.0
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.3/constraints-3.10.txt
apache-airflow[celery, postgres, redis, cncf.kubernetes]==2.8.3
alembic==1.13.1
alembic
airflow-provider-alembic==1.0.0
elementpath==4.4.0
83 changes: 80 additions & 3 deletions tests/open_access/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from open_access.parsers import get_golden_access_records_ids
from open_access.parsers import (get_golden_access_records_ids,
get_green_access_records_ids,
is_subset_540_preprint_green_access,
is_subset_540_publication_golden_access, is_subset_856_for_green_access,
parse_without_names_spaces)

expected = [
expected_golden = [
"2894668",
"2891488",
"2888511",
Expand All @@ -11,6 +15,28 @@
"2882429",
"2882335",
"2882328",
"2882324",
"2882322",
"2882311",
"2882298",
]

expected_green = [
"2894668",
"2891489",
"2891488",
"2891487",
"2888511",
"2888151",
"2886038",
"2884472",
"2884471",
"2884470",
"2884469",
"2883672",
"2882429",
"2882335",
"2882328",
"2882327",
"2882324",
"2882322",
Expand All @@ -22,4 +48,55 @@
def test_get_golden_access_records_dois(shared_datadir):
    """Gold record ids extracted from the fixture must match the expected list.

    Fix: removed a leftover duplicate assert line from the previous revision
    (diff residue) that referenced the renamed ``expected`` list.
    """
    with open(shared_datadir / "search.xml") as file:
        records_ids = get_golden_access_records_ids(file.read())
        assert records_ids == expected_golden


def test_parse_subset_856(shared_datadir):
    """No record in the fixture should qualify via its 856 4_ datafields."""
    with open(shared_datadir / "search.xml") as file:
        parsed_records = parse_without_names_spaces(file.read())
        matched = sum(
            1
            for record in parsed_records.findall(".record")
            if is_subset_856_for_green_access(
                record.findall("datafield[@tag='856'][@ind1='4'][@ind2=' ']")
            )
        )
        assert matched == 0


def test_parse_subset_540_preprint(shared_datadir):
    """Exactly 20 fixture records should be flagged as preprints via 540."""
    with open(shared_datadir / "search.xml") as file:
        parsed_records = parse_without_names_spaces(file.read())
        preprint_records = [
            record
            for record in parsed_records.findall(".record")
            if is_subset_540_preprint_green_access(
                record.findall("datafield[@tag='540'][@ind1=' '][@ind2=' ']")
            )
        ]
        assert len(preprint_records) == 20


def test_parse_subset_540_publications(shared_datadir):
    """Exactly 14 fixture records should be CC BY publications via 540."""
    with open(shared_datadir / "search.xml") as file:
        parsed_records = parse_without_names_spaces(file.read())
        publication_records = [
            record
            for record in parsed_records.findall(".record")
            if is_subset_540_publication_golden_access(
                record.findall("datafield[@tag='540'][@ind1=' '][@ind2=' ']")
            )
        ]
        assert len(publication_records) == 14


def test_get_green_access_records_dois(shared_datadir):
    """Green record ids extracted from the fixture must match the expected list."""
    with open(shared_datadir / "search.xml") as file:
        content = file.read()
        assert get_green_access_records_ids(content) == expected_green
Loading