diff --git a/dags/open_access/open_access.py b/dags/open_access/open_access.py index 5757352..dc9ac5d 100644 --- a/dags/open_access/open_access.py +++ b/dags/open_access/open_access.py @@ -51,12 +51,13 @@ def fetch_count(parameters): ) type_of_query = parameters["type_of_query"] endpoint = parameters["endpoint"] - total = get_total_results_count(response.text) - if type_of_query == "gold": - total = utils.get_golden_access_count(total, endpoint) - if type_of_query == "green": - total = utils.get_green_access_count(total, endpoint) count = get_total_results_count(response.text) + if type_of_query == "gold_open_access": + total_gold = utils.get_golden_access_count(count, endpoint) + return {parameters["type_of_query"]: total_gold} + elif type_of_query == "green_open_access": + total_green = utils.get_green_access_count(count, endpoint) + return {parameters["type_of_query"]: total_green} return {parameters["type_of_query"]: count} queries_objects_list = [ diff --git a/dags/open_access/parsers.py b/dags/open_access/parsers.py index 7230322..5b82694 100644 --- a/dags/open_access/parsers.py +++ b/dags/open_access/parsers.py @@ -89,12 +89,11 @@ def parse_subset_green_access(records): is_it_wanted_record_by_540_publication = ( not is_subset_540_publication_golden_access(datafields_540) ) + green_access_by_field = ( + is_it_wanted_record_by_856 or is_it_wanted_record_by_540_preprint + ) - if ( - is_it_wanted_record_by_856 - or is_it_wanted_record_by_540_preprint - or is_it_wanted_record_by_540_publication - ): + if green_access_by_field and is_it_wanted_record_by_540_publication: filtered_records.append(record) return filtered_records @@ -106,11 +105,7 @@ def parse_subset_golden_access(records): datafields_540 = record.findall("datafield/[@tag='540']") if datafields_540 is None: continue - is_it_wanted_record_by_540_publication = ( - is_subset_540_publication_golden_access(datafields_540) - ) - - if is_it_wanted_record_by_540_publication: + if is_subset_540_publication_golden_access(datafields_540): filtered_records.append(record) return filtered_records diff --git a/dags/open_access/utils.py b/dags/open_access/utils.py index 5a04274..0da1d74 100644 --- a/dags/open_access/utils.py +++ b/dags/open_access/utils.py @@ -14,6 +14,7 @@ def get_count_http_hook(total, url, record_extractor): http_hook = HttpHook(http_conn_id="cds", method="GET") iterations = math.ceil(total / 100.0) records_ids_count = 0 + all_ids = [] for i in range(0, iterations): jrec = (i * 100) + 1 full_url = f"{url}&jrec={jrec}" @@ -24,7 +25,9 @@ def get_count_http_hook(total, url, record_extractor): "retry": retry_if_exception_type(AirflowException), }, ) + all_ids.extend(record_extractor(response.text)) records_ids_count = records_ids_count + len(record_extractor(response.text)) + print(all_ids) logging.info(f"In total was found {records_ids_count} golden access records") return records_ids_count diff --git a/tests/open_access/test_parser.py b/tests/open_access/test_parser.py index f2dc7c7..fdfa68e 100644 --- a/tests/open_access/test_parser.py +++ b/tests/open_access/test_parser.py @@ -25,28 +25,7 @@ "2882298", ] -expected_green = [ - "2894668", - "2891489", - "2891488", - "2891487", - "2888511", - "2888151", - "2886038", - "2884472", - "2884471", - "2884470", - "2884469", - "2883672", - "2882429", - "2882335", - "2882328", - "2882327", - "2882324", - "2882322", - "2882311", - "2882298", -] +expected_green = ["2891489", "2891487", "2886038", "2884472", "2884469"] def test_get_golden_access_records_dois(shared_datadir):