Merge pull request #77 from reliztrent/master

add us-elections package files
LibraryOfCongress · Nov 20, 2024 · 267a2d0 · 267a2d0
2 parents ac68bce + 5a327df
commit 267a2d0
Show file tree

Hide file tree

Showing 4 changed files with 2,627 additions and 20 deletions.
diff --git a/Data Packages/README.md b/Data Packages/README.md
@@ -29,8 +29,8 @@ Each data package has a Jupyter notebook demonstrating how to access and use tha
 - [Digitized Books Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/digitized-books.ipynb)
 - [Directory Holdings Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/directories.ipynb)
 - [Telephone Directories Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/telephone.ipynb)
+- [Selected Dot Gov Media Types, Web Archives Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/dot-gov-pdf.ipynb)
 - [Stereograph Card Images Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/stereographs.ipynb)
 - [Austro-Hungarian Map Set Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/austro-hungarian-maps.ipynb)
 - [General Collection Assessment Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/gen-coll-assessment.ipynb)
-
-
+- [United States Elections, Web Archives Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/us-elections.ipynb)
diff --git a/Data Packages/helpers.py b/Data Packages/helpers.py
@@ -1,40 +1,227 @@
 import os
+import requests
+import sys
+import time
+
 
 def get_file_stats(files):
-    
+    """Summarize a file listing by file extension.
+
+    Params:
+     - files (iterable of dict): each entry is read through its "object_key"
+        (the extension is taken from it) and its "size" (summed per
+        extension; presumably bytes, since it is divided by 1000 to get
+        KB -- confirm against the caller).
+
+    Returns:
+     - list of dict with the keys "FileType", "Count" and "Size", one entry
+        per extension, sorted from largest to smallest total size. "Count"
+        is a string with thousands separators; "Size" is a string in KB, MB
+        or GB (decimal units).
+    """
+
     # For keeping track of files, grouped by file type
     ftypes = {}
 
     # Loop through files
     for file in files:
 
-        filename, file_extension = os.path.splitext(file['object_key'])
+        filename, file_extension = os.path.splitext(file["object_key"])
 
         # Skip directories
+        # (any key without an extension is treated as a directory)
-        if file_extension == '':
+        if file_extension == "":
             continue
-        
+
         # Keep track of count and size
         if file_extension not in ftypes:
-            ftypes[file_extension] = {'Count': 0, 'Size': 0}
-        ftypes[file_extension]['Count'] += 1
-        ftypes[file_extension]['Size'] += file['size']
+            ftypes[file_extension] = {"Count": 0, "Size": 0}
+        ftypes[file_extension]["Count"] += 1
+        ftypes[file_extension]["Size"] += file["size"]
 
     # Convert to list, sort file types by total size
-    stats = [{'FileType': ext, 'Count': ftypes[ext]['Count'], 'Size': ftypes[ext]['Size']} for ext in ftypes]
-    stats = sorted(stats, key=lambda s: -s['Size'])
+    stats = [
+        {"FileType": ext, "Count": ftypes[ext]["Count"], "Size": ftypes[ext]["Size"]}
+        for ext in ftypes
+    ]
+    stats = sorted(stats, key=lambda s: -s["Size"])
 
     # Format numbers
     for i, s in enumerate(stats):
-        stats[i]['Count'] = f'{s["Count"]:,}'
-        kb = round(s["Size"]/1000.0)
-        mb = round(s["Size"]/1000000.0)
-        gb = round(s["Size"]/1000000000.0, 2)
-        size = f'{kb:,}KB'
+        stats[i]["Count"] = f'{s["Count"]:,}'
+        kb = round(s["Size"] / 1000.0)
+        mb = round(s["Size"] / 1000000.0)
+        gb = round(s["Size"] / 1000000000.0, 2)
+        size = f"{kb:,}KB"
+        # Use the largest unit whose rounded value is strictly above 1;
+        # otherwise fall back to the next smaller unit.
         if gb > 1:
-            size = f'{gb:,}GB'
+            size = f"{gb:,}GB"
         elif mb > 1:
-            size = f'{mb:,}MB'
-        stats[i]['Size'] = size
+            size = f"{mb:,}MB"
+        stats[i]["Size"] = size
+
+    return stats
+
+
+def make_request(
+    url: str,
+    params=None,
+    headers=None,
+    max_attempts=10,
+    timeout=60,
+    pause=5,
+    locgov_json=False,
+    json=False,
+    is_blocked=False,
+    verbose=False,
+):
+    """
+    Make a GET request with pauses, retries and uniform error reporting.
+
+    Params:
+     - url (str): URL to request.
+     - params (dict): Dictionary of parameters to pass to requests.get(). Used mostly
+        for API requests. Default None, which is treated as {}.
+     - headers (dict): HTTP headers to pass with request, such as User-Agent.
+        Default None, which is treated as {}.
+     - max_attempts (int): Maximum number of attempts before returning a general
+        error. Default = 10.
+     - timeout (int): Number of seconds before requests.get() times out.
+     - pause (int): Number of baseline seconds to sleep before every request,
+        including the first one. The sleep is pause * attempt number, so it
+        increases on retries.
+     - locgov_json (bool): True means that the response is a loc.gov JSON record
+     - json (bool): True means that the response is a JSON record (this is
+        ignored if locgov_json = True)
+     - is_blocked (bool): True means that the server has already returned a 429.
+        This is for use in loops where you'd like to halt all requests in the
+        event of a 429 status code: feed the returned is_blocked back into the
+        next call and no further request is made.
+     - verbose (bool): True prints a message for every attempt, retry and
+        successful request. Errors are printed regardless.
+
+    Returns:
+     - is_blocked. bool.
+     - result. JSON-like object if locgov_json or json are True; binary
+        (response.content) otherwise; or one of string error messages listed
+        below.
+
+    Error handling:
+        Pause and retry:
+            - Network/DNS issue (requests.get() error)
+            - status_code 5##
+            - loc.gov JSON with 'status' 5##
+            - loc.gov JSON with 'status' 4## (only if it occurs on one of the
+              first three attempts)
+            - status code 404 if locgov_json = True (only if it occurs on one
+              of the first three attempts)
+        Returns `False, "ERROR - NO RECORD"`:
+            - status code 404
+            - loc.gov JSON with 'status' 4## once the retries above are used up
+        Returns `False, 'ERROR - INVALID JSON'`:
+            - invalid loc.gov JSON
+            - invalid JSON
+        Returns `False,'ERROR - GENERAL'`:
+            - on final attempt, returns 5## status_code or loc.gov JSON 'status'.
+            - on final attempt, requests.get() error occurs
+            - status code 403
+        Returns `True, 'ERROR - BLOCKED'`:
+            - status_code 429 (too many requests, blocked by server)
+            - is_blocked was passed in as True (no request is made)
+    """
+    no_record = False, "ERROR - NO RECORD"  # value to return for URLs that don't exist
+    invalid = False, "ERROR - INVALID JSON"  # value to return for invalid JSON
+    error = False, "ERROR - GENERAL"  # value to return for all other errors
+    blocked = True, "ERROR - BLOCKED"  # value to return after receiving a 429
+
+    # Build fresh dicts per call instead of sharing mutable default arguments
+    params = {} if params is None else params
+    headers = {} if headers is None else headers
+
+    if is_blocked is True:
+        print(f"Blocked due to too many requests. Skipping {url} {params}")
+        # Keep the flag set so that a calling loop stays halted
+        return blocked
+
+    for attempt in range(max_attempts):
+        # Linear back-off; the pause also applies before the first request
+        iter_pause = pause * (attempt + 1)
+        if verbose is True and attempt > 0:
+            print(f" Trying again in {iter_pause} seconds . . .")
+        time.sleep(iter_pause)
+        if verbose is True:
+            print(f"Making request. Attempt #{attempt+1} for: {url} {params}")
+
+        try:
+            response = requests.get(
+                url, params=params, timeout=timeout, headers=headers
+            )
+        # requests' ConnectionError does not derive from the builtin one, so
+        # both have to be named to get the specific message
+        except (ConnectionError, requests.exceptions.ConnectionError) as e:
+            print(f"Connection error ({e}): {url} {params}.")
+            continue
+        # Deliberately broad: any other failure of requests.get() is retried
+        except Exception as e:
+            print(f"requests.get() failed ({e}): {url} {params}.")
+            continue
+
+        # 429 too many requests
+        if response.status_code == 429:
+            print(f"Too many requests (429). Skipping: {url} {params}.")
+            return blocked
+
+        # 500 - 599 server error
+        if 500 <= response.status_code < 600:
+            print(f"Server error ({response.status_code}): {url} {params}.")
+            continue
+
+        # 403 forbidden
+        if response.status_code == 403:
+            print(f"Forbidden request (403). Skipping: {url} {params}.")
+            return error
+
+        # 404 doesn't exist
+        if response.status_code == 404:
+            # loc.gov 404s are retried on the first three attempts only
+            if (attempt <= 2) and (locgov_json is True):
+                if verbose is True:
+                    print(
+                        f"Received 404 status_code (locgov_json is True, trying another time): {url} {params}."
+                    )
+                continue
+            print(f"Resource does not exist (404). Skipping: {url} {params}.")
+            return no_record
+
+        # Verify JSON (if applicable)
+        if locgov_json is True:
+            try:
+                output = response.json()
+                # A non-dict payload has no .get() and is reported as invalid
+                status = str(output.get("status"))
+            # loc.gov invalid JSON
+            except Exception as e:
+                print(f"INVALID JSON ({e}): {url} {params}")
+                return invalid
+
+            # loc.gov JSON 4##
+            if status.startswith("4"):
+                if attempt > 2:  # gives up from the fourth attempt onwards
+                    message = f"Resource does not exist (loc.gov {status}). Skipping: {url} {params}."
+                    print(message)
+                    return no_record
+                if verbose is True:
+                    print(f"Received loc.gov JSON 404: {url} {params}")
+                continue
+
+            # loc.gov JSON 5##
+            if status.startswith("5"):
+                if verbose is True:
+                    message = (
+                        f"Server error ({status}). Request for {url} {params}."
+                    )
+                    print(message)
+                continue
+
+            # loc.gov valid JSON
+            if verbose is True:
+                print(f"Successfull request (loc.gov JSON): {url} {params}")
+            # Proceed to RETURN SUCCESSFUL RESULT
+
+        elif json is True:
+            try:
+                output = response.json()
+            except Exception as e:
+                print(f"INVALID JSON ({e}): {url} {params}")
+                return invalid
+            if verbose is True:
+                print(f"Successfull request (JSON): {url} {params}")
+            # Proceed to RETURN SUCCESSFUL RESULT
+
+        else:
+            # Not JSON: hand back the raw response body
+            output = response.content
+            if verbose is True:
+                print(f"Successfull request: {url} {params}")
+            # Proceed to RETURN SUCCESSFUL RESULT
+
+        # RETURN SUCCESSFUL RESULT
+        return False, output
 
-    return stats
+    # If hits max attempts, return general error
+    return error
diff --git a/Data Packages/requirements.txt b/Data Packages/requirements.txt
@@ -16,6 +16,9 @@ scipy
 # For printing data tables
 pandas
 
+# For data analysis
+scikit-learn
+
 # For reading and manipulating images
 Pillow
 
@@ -31,5 +34,8 @@ tqdm
 # For generating wordclouds
 wordcloud
 
+# For parsing XML and HTML
+beautifulsoup4
+
 # For reading, parsing PDFs
 pypdf