Skip to content

Commit

Permalink
Merge pull request #77 from reliztrent/master
Browse files Browse the repository at this point in the history
add us-elections package files
  • Loading branch information
beefoo authored Nov 20, 2024
2 parents ac68bce + 5a327df commit 267a2d0
Show file tree
Hide file tree
Showing 4 changed files with 2,627 additions and 20 deletions.
4 changes: 2 additions & 2 deletions Data Packages/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ Each data package has a Jupyter notebook demonstrating how to access and use tha
- [Digitized Books Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/digitized-books.ipynb)
- [Directory Holdings Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/directories.ipynb)
- [Telephone Directories Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/telephone.ipynb)
- [Selected Dot Gov Media Types, Web Archives Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/dot-gov-pdf.ipynb)
- [Stereograph Card Images Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/stereographs.ipynb)
- [Austro-Hungarian Map Set Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/austro-hungarian-maps.ipynb)
- [General Collection Assessment Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/gen-coll-assessment.ipynb)


- [United States Elections, Web Archives Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/us-elections.ipynb)
223 changes: 205 additions & 18 deletions Data Packages/helpers.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,227 @@
import os
import requests
import sys
import time


def get_file_stats(files):

# For keeping track of files, grouped by file type
ftypes = {}

# Loop through files
for file in files:

filename, file_extension = os.path.splitext(file['object_key'])
filename, file_extension = os.path.splitext(file["object_key"])

# Skip directories
if file_extension == '':
if file_extension == "":
continue

# Keep track of count and size
if file_extension not in ftypes:
ftypes[file_extension] = {'Count': 0, 'Size': 0}
ftypes[file_extension]['Count'] += 1
ftypes[file_extension]['Size'] += file['size']
ftypes[file_extension] = {"Count": 0, "Size": 0}
ftypes[file_extension]["Count"] += 1
ftypes[file_extension]["Size"] += file["size"]

# Convert to list, sort file types by total size
stats = [{'FileType': ext, 'Count': ftypes[ext]['Count'], 'Size': ftypes[ext]['Size']} for ext in ftypes]
stats = sorted(stats, key=lambda s: -s['Size'])
stats = [
{"FileType": ext, "Count": ftypes[ext]["Count"], "Size": ftypes[ext]["Size"]}
for ext in ftypes
]
stats = sorted(stats, key=lambda s: -s["Size"])

# Format numbers
for i, s in enumerate(stats):
stats[i]['Count'] = f'{s["Count"]:,}'
kb = round(s["Size"]/1000.0)
mb = round(s["Size"]/1000000.0)
gb = round(s["Size"]/1000000000.0, 2)
size = f'{kb:,}KB'
stats[i]["Count"] = f'{s["Count"]:,}'
kb = round(s["Size"] / 1000.0)
mb = round(s["Size"] / 1000000.0)
gb = round(s["Size"] / 1000000000.0, 2)
size = f"{kb:,}KB"
if gb > 1:
size = f'{gb:,}GB'
size = f"{gb:,}GB"
elif mb > 1:
size = f'{mb:,}MB'
stats[i]['Size'] = size
size = f"{mb:,}MB"
stats[i]["Size"] = size

return stats


def make_request(
url: str,
params={},
headers={},
max_attempts=10,
timeout=60,
pause=5,
locgov_json=False,
json=False,
is_blocked=False,
verbose=False,
):
"""
Params:
- url (str): URL to request.
- params (dict): Dictionary of parameters to pass to requests.get(). Used mostly
for API requests.
- headers (dict): HTTP headers to pass with request, such as User-Agent.
- max_attempts (int): Maximum number of attempts before returning a general
error. Default = 10.
- timeout (int): Number of seconds before requets.get() times out.
- pause (int): Number of baseline seconds between requests. This will increase
on retries.
- locgov_json (bool): True means that the response is a loc.gov JSON record
- other_json (bool): True means that the response is a JSON record (this is
ignored if locgov_json = True)
- is_blocked (bool): True means that the server has already returned a 429.
This is for use in loops where you'd like to hault all requests in the
event of a 429 status code.
- requested_fields (list): If you would like to output only certain fields,
list them here, e.g., ['aka','item.formats'] Default becomes {}. Only
works if json or locgov_json is True and response is a dictionary.
Returns:
- is_blocked. bool.
- result. JSON-like object if locgov_json or json are True; binary otherwise;
or one of string error messages listed below.
Error handling:
Pause and retry:
- Network/DNS issue (requests.get() error)
- status_code 5##
- loc.gov JSON with 'status' 5##
- loc.gov JSON with 'status' 4## (1 retry only)
- status code 404 if locgov_json = True (1 retry only)
Returns `False, "ERROR - NO RECORD"`:
- status code 404
Returns `False, 'ERROR - INVALID JSON'`:
- invalid loc.gov JSON
- invalid JSON
Returns `False,'ERROR - GENERAL'`:
- on final attempt, returns 5## status_code or loc.gov JSON 'status'.
- on final attempt, requests.get() error occurs
- status code 403
Returns `True, 'ERROR - BLOCKED'`:
- status_code 429 (too many requests, blocked by server)
"""
i = 0
no_record = False, "ERROR - NO RECORD" # value to return for URLs that don't exist
invalid = False, "ERROR - INVALID JSON" # value to return for invalid JSON
error = False, "ERROR - GENERAL" # value to return for all other errors
blocked = True, "ERROR - BLOCKED" # value to return after receiving a 429
if is_blocked is True:
print("Blocked due to too many requests. Skipping {url} {params}")
return error
while i < max_attempts:
iter_pause = pause * (i + 1)
if verbose is True:
retry_msg = f" Trying again in {iter_pause} seconds . . ."
if i > 0:
print(retry_msg)
time.sleep(iter_pause)
if verbose is True:
message = f"Making request. Attempt #{i+1} for: {url} {params}"
print(message)

try:
response = requests.get(
url, params=params, timeout=timeout, headers=headers
)
except ConnectionError as e:
print(f"Connection error ({e}): {url} {params}.")
i += 1
continue
except Exception as e:
print(f"requests.get() failed ({e}): {url} {params}.")
i += 1
continue

# 429 too many requests
if response.status_code == 429:
print(f"Too many requests (429). Skipping: {url} {params}.")
return blocked

# 500 - 599 server error
elif (500 <= response.status_code) & (600 > response.status_code):
print(f"Server error ({response.status_code}): {url} {params}.")
i += 1
continue

# 403 forbidden
elif response.status_code == 403:
print(f"Forbidden request (403). Skipping: {url} {params}.")
return error

# 404 doesn't exist
elif response.status_code == 404:
if (i <= 2) and (locgov_json is True):
if verbose is True:
print(
f"Received 404 status_code (locgov_json is True, trying another time): {url} {params}."
)
i += 1
continue
else:
print(f"Resource does not exist (404). Skipping: {url} {params}.")
return no_record

# Verify JSON (if applicable)
if locgov_json is True:
try:
output = response.json()
status = str(output.get("status"))

# loc.gov JSON 4##
if status.startswith("4"):
if i > 2: # only makes two attempts
message = f"Resource does not exist (loc.gov {status}). Skipping: {url} {params}."
print(message)
return False, no_record
else:
if verbose is True:
print("Received loc.gov JSON 404: {url} {params}")
i += 1
continue

# loc.gov JSON 5##
elif status.startswith("5"):
if verbose is True:
message = (
f"Server error ({status}). Request for {url} {params}."
)
print(message)
i += 1
continue

# loc.gov valid JSON
else:
# Success, return the loc.gov JSON record
if verbose is True:
print(f"Successfull request (loc.gov JSON): {url} {params}")
# Proceed to RETURN SUCCESSFUL RESULT

# loc.gov invalid JSON
except Exception as e:
print(f"INVALID JSON ({e}): {url} {params}")
return invalid

elif json is True:
try:
output = response.json()
if verbose is True:
print(f"Successfull request (JSON): {url} {params}")
# Proceed to RETURN SUCCESSFUL RESULT
except Exception as e:
print(f"INVALID JSON ({e}): {url} {params}")
return invalid

else:
if verbose is True:
print(f"Successfull request: {url} {params}")
# Proceed to RETURN SUCCESSFUL RESULT

# RETURN SUCCESSFUL RESULT
return False, output

return stats
# If hits max attempts, return general error
return error
6 changes: 6 additions & 0 deletions Data Packages/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ scipy
# For printing data tables
pandas

# For data analysis
scikit-learn

# For reading and manipulating images
Pillow

Expand All @@ -31,5 +34,8 @@ tqdm
# For generating wordclouds
wordcloud

# For parsing XML and HTML
beautifulsoup4

# For reading, parsing PDFs
pypdf
Loading

0 comments on commit 267a2d0

Please sign in to comment.