From bee7debeefe5caf426394bfc3020901ede80a148 Mon Sep 17 00:00:00 2001 From: Kevin Greenman Date: Tue, 6 Jun 2023 18:35:30 -0400 Subject: [PATCH] include SUPP files in selection --- select_chem.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/select_chem.py b/select_chem.py index f70f76e..080302e 100644 --- a/select_chem.py +++ b/select_chem.py @@ -71,15 +71,29 @@ def patent_directory(patent_year, patent_link, data_dir): subdirectory = [ s for s in subdirectory if ".txt" not in s ] # don't consider any present .txt files - # subdirectory_zip = [s for s in subdirectory if ".zip" in s] - # TODO: also add ZIP files from *-SUPP directories from late 2010 onwards for item in subdirectory: subdirectory_zip = os.listdir(os.path.join(current_path, item)) for element_zip in subdirectory_zip: if element_zip.lower().endswith(".zip"): list_zip.append(os.path.join(current_path, item, element_zip)) + # There are also ZIP files in *-SUPP directories from late 2010 onwards + if int(patent_year) >= 2010: + if os.path.isdir(os.path.join(current_path + "-SUPP")): + current_path = os.path.join(current_path + "-SUPP") + list_path.append(current_path) + subdirectory = os.listdir(current_path) + subdirectory = [ + s for s in subdirectory if ".txt" not in s + ] # don't consider any present .txt files + + for item in subdirectory: + subdirectory_zip = os.listdir(os.path.join(current_path, item)) + for element_zip in subdirectory_zip: + if element_zip.lower().endswith(".zip"): + list_zip.append(os.path.join(current_path, item, element_zip)) + print("Step 1 Complete") return list_path, list_zip