Merge pull request #3 from BU-Spark/dan_dev
API data loading script
tanisqmallpani23 authored Oct 21, 2024
2 parents 219a466 + 6beb9cf commit 38953d4
Showing 2 changed files with 216 additions and 0 deletions.
73 changes: 73 additions & 0 deletions load_ft_ch.py
@@ -0,0 +1,73 @@
import json
import logging
import sys

import requests

# Initialize logging
logging.basicConfig(filename='text_load.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

i_begin = int(sys.argv[1])  # Start index (inclusive)
i_end = int(sys.argv[2])    # End index (inclusive)

def load_full_text(i_begin, i_end):
    file_path_write = '/projectnb/sparkgrp/ml-bpl-rag-data/text/'
    counter = 0       # Every item inspected in the range
    text_counter = 0  # Items with a transcription since the last checkpoint

    # Fetch the full text of one item by appending '/text' to its identifier URI
    def get_text(url):
        url = url + '/text'
        try:
            response = requests.get(url, timeout=10)  # Timeout to prevent hanging
            if response.status_code == 200:
                return response.text
            else:
                logging.warning(f"Error {response.status_code} for URL {url}")
                return None
        except requests.exceptions.RequestException as e:
            logging.error(f"Request failed for URL {url}: {e}")
            return None

    full_text_locate = {}

    # Load metadata
    with open("/projectnb/sparkgrp/ml-bpl-rag-data/bpl_data.json", 'r') as f:
        bpl_meta = json.load(f)

    # Clamp the requested range to the available data
    if i_begin < 0:
        i_begin = 0
    if (i_end > (len(bpl_meta['Data']) - 1)) or (i_end < i_begin):
        i_end = len(bpl_meta['Data']) - 1

    try:
        # Process the items within the specified range (i_end is treated as an
        # inclusive index, matching the clamp above, so the slice ends at i_end + 1)
        for item in list(bpl_meta['Data'])[i_begin:i_end + 1]:
            if ('has_transcription_bsi' in item['attributes']) and ('identifier_uri_ss' in item['attributes']):
                full_text_locate[item['id']] = {
                    'text': get_text(item['attributes']['identifier_uri_ss'])
                }
                text_counter += 1
            counter += 1  # Increment counter for every processed item

            # Save a checkpoint every 50,000 items
            if counter % 50000 == 0:
                with open(f'{file_path_write}ft_{i_end//100000}_checkpoint_{counter // 50000}_{text_counter}.json', 'w') as check:
                    json.dump(full_text_locate, check)
                full_text_locate.clear()  # Clear the dictionary to free memory
                text_counter = 0

    except Exception as e:
        # Dump whatever has been collected so far before giving up
        with open(f'{file_path_write}checkpoint_interrupted.json', 'w') as check:
            json.dump(full_text_locate, check)
        logging.error(f"Process interrupted: {e}")

    # Save final checkpoint
    with open(f'{file_path_write}ft_{i_end//100000}_checkpoint_end_{text_counter}.json', 'w') as check:
        json.dump(full_text_locate, check)

    print(f"Checked in {counter} texts")

if __name__ == "__main__":
    load_full_text(i_begin, i_end)
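
Each checkpoint written above is an independent JSON dump of an {id: {'text': ...}} mapping, so downstream consumers have to stitch the pieces back together. A minimal sketch of that merge, assuming the ft_*_checkpoint_*.json names produced by this script and that the combined result fits in memory:

import glob
import json

# Merge every checkpoint produced by load_ft_ch.py into one dict.
# The pattern matches the ft_<bucket>_checkpoint_<n>_<count>.json and
# ft_<bucket>_checkpoint_end_<count>.json names used above; it is an
# assumption that no unrelated files share that prefix.
merged = {}
for path in sorted(glob.glob('/projectnb/sparkgrp/ml-bpl-rag-data/text/ft_*_checkpoint_*.json')):
    with open(path) as f:
        merged.update(json.load(f))
print(f"Loaded {len(merged)} transcribed items")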

143 changes: 143 additions & 0 deletions load_script.py
@@ -0,0 +1,143 @@
import json
import os
import sys
import time

import requests


def fetch_digital_commonwealth():
    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    PAGE = sys.argv[1]      # First page to fetch
    END_PAGE = sys.argv[2]  # Stop once next_page reaches this page
    file_name = f"out{PAGE}_{END_PAGE}.json"
    FINAL_PAGE = 13038      # Last page of the full collection
    output = []
    file_path = f"/projectnb/sparkgrp/ml-bpl-rag-data/{file_name}"
    # file_path = './output.json'

    # Resume from an existing output file, if any
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            output = json.load(file)
        if int(PAGE) < (len(output) + 1):
            PAGE = len(output) + 1

    if int(PAGE) >= int(END_PAGE):
        return None

    print(f'Reading page {PAGE} up to page {END_PAGE}')
    retries = 0
    while True:
        try:
            response = requests.get(f"{BASE_URL}&page={PAGE}")
            response.raise_for_status()
            data = response.json()

            # Append current page data to the output list
            output.append(data)

            # Save the entire output to a JSON file after each iteration
            with open(file_path, 'w') as f:
                json.dump(output, f)

            # Check if there is a next page
            if data['meta']['pages']['next_page']:
                if data['meta']['pages']['next_page'] == int(END_PAGE):
                    print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                    break
                elif data['meta']['pages']['next_page'] == FINAL_PAGE:
                    print(f"finished page {PAGE}")
                    PAGE = FINAL_PAGE
                else:
                    print(f"finished page {PAGE}")
                    PAGE = data['meta']['pages']['next_page']
            else:
                print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                break

            retries = 0
            # Optional: add a small delay to avoid overwhelming the API
            # time.sleep(0.5)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
            retries += 1
            if retries >= 5:
                break

    end = time.time()
    print(f"Timer: {end - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")

if __name__ == "__main__":
    fetch_digital_commonwealth()
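
The pagination loop above depends only on the meta.pages.next_page field of each search.json payload. A minimal probe of that contract, assuming the per-page record list lives under a 'data' key (everything beyond the 'meta' path the loop reads is an assumption):

import requests

# Fetch a single search page and print the fields the harvester relies on.
resp = requests.get(
    "https://www.digitalcommonwealth.org/search.json"
    "?search_field=all_fields&per_page=100&q=&page=1",
    timeout=10,
)
page = resp.json()
print(page['meta']['pages']['next_page'])  # next page number; falsy on the last page
print(len(page.get('data', [])))           # per-page records; the 'data' key is assumed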
