Skip to content

Commit

Permalink
OA: green access parsing fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
ErnestaP committed Jul 31, 2024
1 parent 1ba37db commit 0554a08
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 37 deletions.
11 changes: 6 additions & 5 deletions dags/open_access/open_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ def fetch_count(parameters):
)
type_of_query = parameters["type_of_query"]
endpoint = parameters["endpoint"]
total = get_total_results_count(response.text)
if type_of_query == "gold":
total = utils.get_golden_access_count(total, endpoint)
if type_of_query == "green":
total = utils.get_green_access_count(total, endpoint)
count = get_total_results_count(response.text)
if type_of_query == "gold_open_access":
total_gold = utils.get_golden_access_count(count, endpoint)
return {parameters["type_of_query"]: total_gold}
elif type_of_query == "green_open_access":
total_green = utils.get_green_access_count(count, endpoint)
return {parameters["type_of_query"]: total_green}
return {parameters["type_of_query"]: count}

queries_objects_list = [
Expand Down
15 changes: 5 additions & 10 deletions dags/open_access/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,11 @@ def parse_subset_green_access(records):
is_it_wanted_record_by_540_publication = (
not is_subset_540_publication_golden_access(datafields_540)
)
green_access_by_field = (
is_it_wanted_record_by_856 or is_it_wanted_record_by_540_preprint
)

if (
is_it_wanted_record_by_856
or is_it_wanted_record_by_540_preprint
or is_it_wanted_record_by_540_publication
):
if green_access_by_field and is_it_wanted_record_by_540_publication:
filtered_records.append(record)

return filtered_records
Expand All @@ -106,11 +105,7 @@ def parse_subset_golden_access(records):
datafields_540 = record.findall("datafield/[@tag='540']")
if datafields_540 is None:
continue
is_it_wanted_record_by_540_publication = (
is_subset_540_publication_golden_access(datafields_540)
)

if is_it_wanted_record_by_540_publication:
if is_subset_540_publication_golden_access(datafields_540):
filtered_records.append(record)
return filtered_records

Expand Down
2 changes: 2 additions & 0 deletions dags/open_access/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def get_count_http_hook(total, url, record_extractor):
http_hook = HttpHook(http_conn_id="cds", method="GET")
iterations = math.ceil(total / 100.0)
records_ids_count = 0
all_ids = []
for i in range(0, iterations):
jrec = (i * 100) + 1
full_url = f"{url}&jrec={jrec}"
Expand All @@ -24,6 +25,7 @@ def get_count_http_hook(total, url, record_extractor):
"retry": retry_if_exception_type(AirflowException),
},
)
all_ids.extend(record_extractor(response.text))
records_ids_count = records_ids_count + len(record_extractor(response.text))
logging.info(f"In total was found {records_ids_count} golden access records")
return records_ids_count
Expand Down
23 changes: 1 addition & 22 deletions tests/open_access/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,7 @@
"2882298",
]

expected_green = [
"2894668",
"2891489",
"2891488",
"2891487",
"2888511",
"2888151",
"2886038",
"2884472",
"2884471",
"2884470",
"2884469",
"2883672",
"2882429",
"2882335",
"2882328",
"2882327",
"2882324",
"2882322",
"2882311",
"2882298",
]
expected_green = ["2891489", "2891487", "2886038", "2884472", "2884469"]


def test_get_golden_access_records_dois(shared_datadir):
Expand Down

0 comments on commit 0554a08

Please sign in to comment.