Merge pull request #3 from BU-Spark/dan_dev
API data loading script
tanisqmallpani23 authored Oct 21, 2024
2 parents 219a466 + 6beb9cf commit 38953d4
Showing 2 changed files with 216 additions and 0 deletions.
73 changes: 73 additions & 0 deletions load_ft_ch.py
@@ -0,0 +1,73 @@
import json
import logging
import sys

import requests

# Initialize logging
logging.basicConfig(filename='text_load.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

i_begin = int(sys.argv[1])  # Start index (inclusive)
i_end = int(sys.argv[2])    # End index (inclusive)

def load_full_text(i_begin, i_end):
    file_path_write = '/projectnb/sparkgrp/ml-bpl-rag-data/text/'
    counter = 0       # Every item inspected in the range
    text_counter = 0  # Items with a transcription since the last checkpoint

    # Fetch the full text of one item by appending '/text' to its identifier URI
    def get_text(url):
        url = url + '/text'
        try:
            response = requests.get(url, timeout=10)  # Timeout to prevent hanging
            if response.status_code == 200:
                return response.text
            else:
                logging.warning(f"Error {response.status_code} for URL {url}")
                return None
        except requests.exceptions.RequestException as e:
            logging.error(f"Request failed for URL {url}: {e}")
            return None

    full_text_locate = {}

    # Load metadata
    with open("/projectnb/sparkgrp/ml-bpl-rag-data/bpl_data.json", 'r') as f:
        bpl_meta = json.load(f)

    # Clamp the requested range to the available data
    if i_begin < 0:
        i_begin = 0
    if (i_end > (len(bpl_meta['Data']) - 1)) or (i_end < i_begin):
        i_end = len(bpl_meta['Data']) - 1

    try:
        # Process the items within the specified range (i_end is treated as an
        # inclusive index, matching the clamp above, so the slice ends at i_end + 1)
        for item in list(bpl_meta['Data'])[i_begin:i_end + 1]:
            if ('has_transcription_bsi' in item['attributes']) and ('identifier_uri_ss' in item['attributes']):
                full_text_locate[item['id']] = {
                    'text': get_text(item['attributes']['identifier_uri_ss'])
                }
                text_counter += 1
            counter += 1  # Increment counter for every processed item

            # Save a checkpoint every 50,000 items
            if counter % 50000 == 0:
                with open(f'{file_path_write}ft_{i_end//100000}_checkpoint_{counter // 50000}_{text_counter}.json', 'w') as check:
                    json.dump(full_text_locate, check)
                full_text_locate.clear()  # Clear the dictionary to free memory
                text_counter = 0

    except Exception as e:
        # Dump whatever has been collected so far before giving up
        with open(f'{file_path_write}checkpoint_interrupted.json', 'w') as check:
            json.dump(full_text_locate, check)
        logging.error(f"Process interrupted: {e}")

    # Save final checkpoint
    with open(f'{file_path_write}ft_{i_end//100000}_checkpoint_end_{text_counter}.json', 'w') as check:
        json.dump(full_text_locate, check)

    print(f"Checked in {counter} texts")

if __name__ == "__main__":
    load_full_text(i_begin, i_end)
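
Each checkpoint written above is an independent JSON dump of an {id: {'text': ...}} mapping, so downstream consumers have to stitch the pieces back together. A minimal sketch of that merge, assuming the ft_*_checkpoint_*.json names produced by this script and that the combined result fits in memory:

import glob
import json

# Merge every checkpoint produced by load_ft_ch.py into one dict.
# The pattern matches the ft_<bucket>_checkpoint_<n>_<count>.json and
# ft_<bucket>_checkpoint_end_<count>.json names used above; it is an
# assumption that no unrelated files share that prefix.
merged = {}
for path in sorted(glob.glob('/projectnb/sparkgrp/ml-bpl-rag-data/text/ft_*_checkpoint_*.json')):
    with open(path) as f:
        merged.update(json.load(f))
print(f"Loaded {len(merged)} transcribed items")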

143 changes: 143 additions & 0 deletions load_script.py
@@ -0,0 +1,143 @@
import json
import os
import sys
import time

import requests


def fetch_digital_commonwealth():
    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    PAGE = sys.argv[1]      # First page to fetch
    END_PAGE = sys.argv[2]  # Stop once next_page reaches this page
    file_name = f"out{PAGE}_{END_PAGE}.json"
    FINAL_PAGE = 13038      # Last page of the full collection
    output = []
    file_path = f"/projectnb/sparkgrp/ml-bpl-rag-data/{file_name}"
    # file_path = './output.json'

    # Resume from an existing output file, if any
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            output = json.load(file)
        if int(PAGE) < (len(output) + 1):
            PAGE = len(output) + 1

    if int(PAGE) >= int(END_PAGE):
        return None

    print(f'Reading page {PAGE} up to page {END_PAGE}')
    retries = 0
    while True:
        try:
            response = requests.get(f"{BASE_URL}&page={PAGE}")
            response.raise_for_status()
            data = response.json()

            # Append current page data to the output list
            output.append(data)

            # Save the entire output to a JSON file after each iteration
            with open(file_path, 'w') as f:
                json.dump(output, f)

            # Check if there is a next page
            if data['meta']['pages']['next_page']:
                if data['meta']['pages']['next_page'] == int(END_PAGE):
                    print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                    break
                elif data['meta']['pages']['next_page'] == FINAL_PAGE:
                    print(f"finished page {PAGE}")
                    PAGE = FINAL_PAGE
                else:
                    print(f"finished page {PAGE}")
                    PAGE = data['meta']['pages']['next_page']
            else:
                print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                break

            retries = 0
            # Optional: add a small delay to avoid overwhelming the API
            # time.sleep(0.5)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
            retries += 1
            if retries >= 5:
                break

    end = time.time()
    print(f"Timer: {end - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")

if __name__ == "__main__":
    fetch_digital_commonwealth()
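
The pagination loop above depends only on the meta.pages.next_page field of each search.json payload. A minimal probe of that contract, assuming the per-page record list lives under a 'data' key (everything beyond the 'meta' path the loop reads is an assumption):

import requests

# Fetch a single search page and print the fields the harvester relies on.
resp = requests.get(
    "https://www.digitalcommonwealth.org/search.json"
    "?search_field=all_fields&per_page=100&q=&page=1",
    timeout=10,
)
page = resp.json()
print(page['meta']['pages']['next_page'])  # next page number; falsy on the last page
print(len(page.get('data', [])))           # per-page records; the 'data' key is assumed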
