Skip to content

Commit

Permalink
Parsers: parsing green and golden access from larger set
Browse files Browse the repository at this point in the history
  • Loading branch information
ErnestaP committed Jun 26, 2024
1 parent 82dad35 commit 4ca46cf
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 35 deletions.
7 changes: 2 additions & 5 deletions dags/open_access/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@
r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+" + r"not+540__f:Bronze+not+540__3:preprint"
)
BRONZE_ACCESS = r"540__f:'Bronze'"
GREEN_ACCESS = (
r"not+540__a:'CC+BY'+not+540__a:'CC-BY'+not+540__a:"
+ r"'arXiv+nonexclusive-distrib'+not+540__f:'Bronze'"
)
GOLD_ACCESS = r"540__3:'publication'+and+" + r"(540__a:'CC-BY'+OR++540__a:'CC+BY')"
GREEN_ACCESS = r""
GOLD_ACCESS = r""

CERN_READ_AND_PUBLISH = r"540__f:'CERN-RP"
CERN_INDIVIDUAL_APCS = r"540__f:'CERN-APC'"
Expand Down
4 changes: 2 additions & 2 deletions dags/open_access/open_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ def fetch_data_task(query, **kwargs):
+ r"not+980:BookChapter"
)
type_of_query = [*query][0]
url = utils.get_url(query=f"{base_query}+{query[type_of_query]}")
url = utils.get_url(query=f"{base_query}")
data = request_again_if_failed(url=url, cds_token=cds_token)
total = get_total_results_count(data.text)
if type_of_query == "gold":
total = utils.get_gold_access_count(total, url)
if type_of_query == "green":
total = total - utils.get_gold_access_count(total, url)
total = utils.get(total, url)
return {type_of_query: total}

@task(multiple_outputs=True, executor_config=kubernetes_executor_config)
Expand Down
139 changes: 119 additions & 20 deletions dags/open_access/parsers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import xml.etree.ElementTree as ET
from io import StringIO

Expand All @@ -13,25 +14,123 @@ def parse_without_names_spaces(xml):
return root


def get_golden_access_records_ids(data):
xml = parse_without_names_spaces(data)
records = xml.findall(".record")
golden_access = []
def is_correct_value(value):
    """Return True when the subfield's text is one of the wanted labels.

    *value* is an XML subfield element; comparison is case-insensitive.
    Raises AttributeError when the element is None or has no text —
    callers deliberately catch that.
    """
    wanted_labels = ("accepted manuscript", "preprint")
    return value.text.lower() in wanted_labels


def field_has_cc_by(field_value):
    """Return True when *field_value* begins with a CC-BY 4.0 license tag.

    Accepts separator variants in any case: "CC BY 4.0", "CC-BY-4.0",
    "CCBY4.0". "CC BY-SA 4.0" does NOT match — the "-SA" breaks the
    pattern — so share-alike licenses are excluded.

    NOTE(review): re.match anchors at the start of the string, so a value
    such as "License: CC BY 4.0" is rejected; confirm whether a substring
    search is wanted instead.
    """
    # The version dot must be literal (previously the unescaped "." also
    # accepted strings like "CC-BY-4x0").
    pattern = re.compile(r"CC[\s-]?BY[\s-]?4\.0", flags=re.I)
    return bool(pattern.match(field_value))


def parse_subset_856(datafields_856):
    """Return True when at least one 856 datafield has a wanted $y value.

    A datafield counts when its subfield $y text is "accepted manuscript"
    or "preprint" (see is_correct_value). Datafields without a $y
    subfield, or whose subfield has no text, are skipped.
    """
    for datafield in datafields_856:
        subfield = datafield.find("subfield[@code='y']")
        try:
            # Bug fix: the previous version unconditionally overwrote the
            # accumulator on every iteration, so only the LAST datafield
            # decided the result instead of "at least one".
            if is_correct_value(subfield):
                return True
        except AttributeError:
            # subfield missing or has no text — not a wanted value
            continue
    return False


def parse_subset_540_preprint(datafields_540):
    """Return True when any 540 datafield has subfield $3 equal to "preprint"."""

    def _is_preprint(datafield):
        subfield_3 = datafield.find("subfield[@code='3']")
        try:
            return subfield_3.text.lower() == "preprint"
        except AttributeError:
            # no $3 subfield, or it carries no text
            return False

    return any(_is_preprint(datafield) for datafield in datafields_540)


def parse_subset_540_publication(datafields_540):
    """Return True when any 540 datafield marks a CC-BY publication.

    A datafield counts when its subfield $3 text is "publication" AND its
    subfield $a text carries a CC-BY license (see field_has_cc_by).
    Datafields missing either subfield, or whose subfield has no text,
    are skipped.
    """
    for datafield in datafields_540:
        subfield_3 = datafield.find("subfield[@code='3']")
        subfield_a = datafield.find("subfield[@code='a']")
        try:
            is_publication = subfield_3.text.lower() == "publication"
            has_cc_by = field_has_cc_by(subfield_a.text)
        except AttributeError:
            # a subfield is missing or carries no text — skip this field
            continue
        if is_publication and has_cc_by:
            return True
    return False


def parse_subset_green_access(records):
    """Filter *records* down to those that qualify as green open access.

    A record qualifies when any of the following holds:
    - an 856 4_ datafield has a wanted $y value (accepted manuscript or
      preprint),
    - a 540 datafield marks it as a preprint, or
    - NO 540 datafield marks it as a CC-BY publication.
    """
    filtered_records = []
    for record in records:
        datafields_856 = record.findall("datafield[@tag='856'][@ind1='4'][@ind2=' ']")
        datafields_540 = record.findall("datafield/[@tag='540']")
        # findall always returns a list (possibly empty), never None, so
        # the previous "is None: continue" guards could never fire and
        # were removed; the helpers below handle empty lists correctly.
        is_it_wanted_record_by_856 = parse_subset_856(datafields_856)
        is_it_wanted_record_by_540_preprint = parse_subset_540_preprint(datafields_540)
        is_it_wanted_record_by_540_publication = not parse_subset_540_publication(
            datafields_540
        )

        if (
            is_it_wanted_record_by_856
            or is_it_wanted_record_by_540_preprint
            or is_it_wanted_record_by_540_publication
        ):
            filtered_records.append(record)

    return filtered_records


def parse_subset_golden_access(records):
    """Filter *records* down to those that qualify as gold open access.

    A record qualifies when at least one 540 datafield marks it as a
    CC-BY publication (see parse_subset_540_publication).
    """
    filtered_records = []
    for record in records:
        datafields_540 = record.findall("datafield/[@tag='540']")
        # findall returns a list, never None; an empty list simply yields
        # False from the helper below, so no separate guard is needed.
        if parse_subset_540_publication(datafields_540):
            filtered_records.append(record)
    return filtered_records


def get_records_ids(data, record_filter):
    """Return the 001 controlfield values of the records kept by *record_filter*.

    *data* is raw MARC-XML text; *record_filter* maps the full list of
    record elements to the sublist to keep. Records without a 001
    controlfield are skipped.
    """
    root = parse_without_names_spaces(data)
    all_records = root.findall(".record")
    ids = []
    for record in record_filter(all_records):
        controlfield_001 = record.find("controlfield/[@tag='001']")
        if controlfield_001 is not None:
            ids.append(controlfield_001.text)
    return ids


def get_golden_access_records_ids(data):
    """Return the record IDs of gold open-access records in *data* (MARC-XML)."""
    return get_records_ids(data, record_filter=parse_subset_golden_access)


def get_green_access_records_ids(data):
    """Return the record IDs of green open-access records in *data* (MARC-XML)."""
    return get_records_ids(data, record_filter=parse_subset_green_access)
17 changes: 12 additions & 5 deletions dags/open_access/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,30 @@
import math

from common.utils import request_again_if_failed
from open_access.parsers import get_golden_access_records_ids
from open_access.parsers import (get_golden_access_records_ids,
get_green_access_records_ids)


def get_gold_access_count(total, url):
def get_count(total, url, record_extractor):
    """Page through the search results at *url* and count extracted record IDs.

    *total* is the overall number of results; pages of 100 records are
    fetched via the CDS "jrec" offset query parameter, and
    *record_extractor* turns each response body into a list of record IDs.
    """
    iterations = math.ceil(total / 100.0)
    records_ids_count = 0
    for page in range(iterations):
        jrec = page * 100 + 1  # jrec is a 1-based record offset
        full_url = f"{url}&jrec={jrec}"
        response = request_again_if_failed(full_url)
        records_ids_count += len(record_extractor(response.text))
    # Generic wording + lazy %-args: this helper serves both golden and
    # green counts, so the old "golden access records" message was
    # misleading for green runs.
    logging.info("In total %s records were found", records_ids_count)
    return records_ids_count


def get_golden_access_count(total, url):
    """Count gold open-access records among *total* results at *url*."""
    return get_count(total, url, record_extractor=get_golden_access_records_ids)


def get_green_access_count(total, url):
    """Count green open-access records among *total* results at *url*."""
    return get_count(total, url, record_extractor=get_green_access_records_ids)


def get_url(query, current_collection="Published+Articles"):
url = (
rf"https://cds.cern.ch/search?ln=en&cc={current_collection}&p={query}"
Expand Down
83 changes: 80 additions & 3 deletions tests/open_access/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from open_access.parsers import get_golden_access_records_ids
from open_access.parsers import (get_golden_access_records_ids,
get_green_access_records_ids,
parse_subset_540_preprint,
parse_subset_540_publication, parse_subset_856,
parse_without_names_spaces)

expected = [
expected_golden = [
"2894668",
"2891488",
"2888511",
Expand All @@ -11,6 +15,28 @@
"2882429",
"2882335",
"2882328",
"2882324",
"2882322",
"2882311",
"2882298",
]

expected_green = [
"2894668",
"2891489",
"2891488",
"2891487",
"2888511",
"2888151",
"2886038",
"2884472",
"2884471",
"2884470",
"2884469",
"2883672",
"2882429",
"2882335",
"2882328",
"2882327",
"2882324",
"2882322",
Expand All @@ -22,4 +48,55 @@
def test_get_golden_access_records_dois(shared_datadir):
    """The golden-access filter extracts exactly the expected record IDs."""
    # Diff residue removed: the superseded `assert records_ids == expected`
    # line (pre-rename of the expected-IDs list) is dropped.
    with open(shared_datadir / "search.xml") as file:
        records_ids = get_golden_access_records_ids(file.read())
    assert records_ids == expected_golden


def test_parse_subset_856(shared_datadir):
    """No record in the fixture has a wanted 856 4_ $y subfield."""
    with open(shared_datadir / "search.xml") as file:
        root = parse_without_names_spaces(file.read())
    matches = sum(
        1
        for record in root.findall(".record")
        if parse_subset_856(
            record.findall("datafield[@tag='856'][@ind1='4'][@ind2=' ']")
        )
    )
    assert matches == 0


def test_parse_subset_540_preprint(shared_datadir):
    """Exactly 20 fixture records carry a 540 $3 == "preprint" marker."""
    with open(shared_datadir / "search.xml") as file:
        root = parse_without_names_spaces(file.read())
    matches = sum(
        1
        for record in root.findall(".record")
        if parse_subset_540_preprint(
            record.findall("datafield[@tag='540'][@ind1=' '][@ind2=' ']")
        )
    )
    assert matches == 20


def test_parse_subset_540_publications(shared_datadir):
    """Exactly 14 fixture records are marked as CC-BY publications."""
    with open(shared_datadir / "search.xml") as file:
        root = parse_without_names_spaces(file.read())
    matches = sum(
        1
        for record in root.findall(".record")
        if parse_subset_540_publication(
            record.findall("datafield[@tag='540'][@ind1=' '][@ind2=' ']")
        )
    )
    assert matches == 14


def test_get_green_access_records_dois(shared_datadir):
    """The green-access filter extracts exactly the expected record IDs."""
    xml_payload = (shared_datadir / "search.xml").read_text()
    assert get_green_access_records_ids(xml_payload) == expected_green

0 comments on commit 4ca46cf

Please sign in to comment.