-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from BU-Spark/dan_dev
api data loading script
- Loading branch information
Showing
2 changed files
with
216 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import json
import requests
import sys
import logging

# Configure file-based logging for the load run.
logging.basicConfig(filename='text_load.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Start and end indices for the slice of metadata records to fetch,
# taken from the command line.
i_begin, i_end = int(sys.argv[1]), int(sys.argv[2])
|
||
def load_full_text(i_begin, i_end):
    """Fetch full-text transcriptions for a slice of BPL metadata records.

    Reads the metadata dump at ``bpl_data.json``, and for every item in
    ``Data[i_begin:i_end]`` (inclusive of ``i_end``) that advertises a
    transcription, downloads the text from ``<identifier_uri>/text`` and
    periodically checkpoints the accumulated results to JSON files.

    Args:
        i_begin: First index to process; clamped up to 0 if negative.
        i_end: Last index to process (inclusive); clamped down to the last
            valid index if out of range or smaller than ``i_begin``.

    Side effects: writes checkpoint JSON files under ``file_path_write``
    and logs progress/errors via the module-level logging config.
    """
    file_path_write = '/projectnb/sparkgrp/ml-bpl-rag-data/text/'
    counter = 0        # items with a transcription fetched since start
    text_counter = 0   # items accumulated since the last checkpoint

    def get_text(url):
        """Return the body of ``url + '/text'``, or None on any failure."""
        url = url + '/text'
        try:
            response = requests.get(url, timeout=10)  # timeout prevents hanging
            if response.status_code == 200:
                return response.text
            logging.warning(f"Error {response.status_code} for URL {url}")
            return None
        except requests.exceptions.RequestException as e:
            logging.error(f"Request failed for URL {url}: {e}")
            return None

    full_text_locate = {}

    # Load metadata
    with open("/projectnb/sparkgrp/ml-bpl-rag-data/bpl_data.json", 'r') as f:
        bpl_meta = json.load(f)

    # Clamp the requested range; i_end is treated as an inclusive index.
    if i_begin < 0:
        i_begin = 0
    if (i_end > (len(bpl_meta['Data']) - 1)) or (i_end < i_begin):
        i_end = len(bpl_meta['Data']) - 1

    try:
        # i_end is an inclusive bound (see clamp above), so slice to i_end + 1
        # — the original [i_begin:i_end] silently dropped the last item.
        for item in list(bpl_meta['Data'])[i_begin:i_end + 1]:
            if ('has_transcription_bsi' in item['attributes']) and ('identifier_uri_ss' in item['attributes']):
                full_text_locate[item['id']] = {
                    'text': get_text(item['attributes']['identifier_uri_ss'])
                }
                text_counter += 1
                counter += 1  # counts only items that had a transcription

                # Save a checkpoint every 50000 fetched texts, then drop the
                # in-memory dict to keep the footprint bounded.
                if counter % 50000 == 0:
                    with open(f'{file_path_write}ft_{i_end//100000}_checkpoint_{str(counter // 50000)}_{text_counter}.json', 'w') as check:
                        json.dump(full_text_locate, check)
                    full_text_locate.clear()
                    text_counter = 0

    except Exception:
        # Best-effort dump of whatever was collected before the failure;
        # logging.exception records the traceback for post-mortem.
        with open(f'{file_path_write}checkpoint_interrupted.json', 'w') as check:
            json.dump(full_text_locate, check)
        logging.exception("Process interrupted")

    # Save final checkpoint with whatever remains after the last flush.
    with open(f'{file_path_write}ft_{i_end//100000}_checkpoint_end_{text_counter}.json', 'w') as check:
        json.dump(full_text_locate, check)

    print(f"Checked in {counter} texts")

if __name__ == "__main__":
    load_full_text(i_begin, i_end)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
|
||
|
||
import json | ||
|
||
import time | ||
|
||
import os | ||
|
||
import sys | ||
|
||
import requests | ||
|
||
|
||
|
||
|
||
|
||
def fetch_digital_commonwealth():
    """Crawl Digital Commonwealth search-result pages into a local JSON file.

    Pages are fetched from ``search.json`` starting at ``sys.argv[1]`` up to
    (but not including) ``sys.argv[2]``, 100 records per page. Each fetched
    page is appended to ``output`` and the whole list is rewritten to disk
    after every page, so an interrupted run can resume: if the output file
    already exists, the start page is advanced past the pages it contains.

    Side effects: reads/writes the output JSON file and prints progress.
    Returns None (early, without fetching) when the start page already
    reaches END_PAGE.
    """
    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    PAGE = sys.argv[1]
    END_PAGE = sys.argv[2]
    # File name is derived from the ORIGINAL argv values, not the resume
    # point, so a resumed run keeps appending to the same file.
    file_name = f"out{PAGE}_{END_PAGE}.json"
    FINAL_PAGE = 13038  # last page of the full collection at crawl time
    output = []
    file_path = f"/projectnb/sparkgrp/ml-bpl-rag-data/{file_name}"

    # Resume support: skip pages already saved in a previous run.
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            output = json.load(file)
        if int(PAGE) < (len(output) + 1):
            PAGE = len(output) + 1

    if int(PAGE) >= int(END_PAGE):
        return None

    print(f'Reading page {PAGE} up to page {END_PAGE}')
    retries = 0
    while True:
        try:
            # Timeout added: without it a stalled connection hangs forever.
            response = requests.get(f"{BASE_URL}&page={PAGE}", timeout=30)
            response.raise_for_status()
            data = response.json()

            # Append current page data and rewrite the whole file so no
            # progress is lost if the run dies mid-crawl.
            output.append(data)
            with open(file_path, 'w') as f:
                json.dump(output, f)

            # Advance to the next page, or stop at END_PAGE / the last page.
            if data['meta']['pages']['next_page']:
                if data['meta']['pages']['next_page'] == int(END_PAGE):
                    print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                    break
                elif data['meta']['pages']['next_page'] == FINAL_PAGE:
                    print(f"finished page {PAGE}")
                    PAGE = FINAL_PAGE
                else:
                    print(f"finished page {PAGE}")
                    PAGE = data['meta']['pages']['next_page']
            else:
                # No next page: the collection is exhausted.
                print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                break

            retries = 0  # a successful page resets the failure budget
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
            retries += 1
            if retries >= 5:
                break
            # Brief backoff before retrying the same page instead of
            # hammering the API in a tight loop.
            time.sleep(1)

    end = time.time()
    print(f"Timer: {end - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")

if __name__ == "__main__":
    fetch_digital_commonwealth()