diff --git a/Data Packages/README.md b/Data Packages/README.md index 72cdf95..c3ca753 100644 --- a/Data Packages/README.md +++ b/Data Packages/README.md @@ -29,8 +29,8 @@ Each data package has a Jupyter notebook demonstrating how to access and use tha - [Digitized Books Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/digitized-books.ipynb) - [Directory Holdings Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/directories.ipynb) - [Telephone Directories Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/telephone.ipynb) +- [Selected Dot Gov Media Types, Web Archives Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/dot-gov-pdf.ipynb) - [Stereograph Card Images Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/stereographs.ipynb) - [Austro-Hungarian Map Set Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/austro-hungarian-maps.ipynb) - [General Collection Assessment Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/gen-coll-assessment.ipynb) - - +- [United States Elections, Web Archives Data Package](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/us-elections.ipynb) diff --git a/Data Packages/helpers.py b/Data Packages/helpers.py index 4ae169f..7083163 100644 --- a/Data Packages/helpers.py +++ b/Data Packages/helpers.py @@ -1,40 +1,227 @@ import os +import requests +import sys +import time + def get_file_stats(files): - + # For keeping track of files, grouped by file type ftypes = {} # Loop through files for file in files: - filename, file_extension = os.path.splitext(file['object_key']) + filename, file_extension = os.path.splitext(file["object_key"]) # Skip directories - if file_extension == '': + if file_extension == "": continue - + # Keep track of count and size if file_extension not in ftypes: - ftypes[file_extension] = {'Count': 0, 'Size': 0} - ftypes[file_extension]['Count'] += 1 - ftypes[file_extension]['Size'] += file['size'] + ftypes[file_extension] = {"Count": 0, "Size": 0} + ftypes[file_extension]["Count"] += 1 + ftypes[file_extension]["Size"] += file["size"] # Convert to list, sort file types by total size - stats = [{'FileType': ext, 'Count': ftypes[ext]['Count'], 'Size': ftypes[ext]['Size']} for ext in ftypes] - stats = sorted(stats, key=lambda s: -s['Size']) + stats = [ + {"FileType": ext, "Count": ftypes[ext]["Count"], "Size": ftypes[ext]["Size"]} + for ext in ftypes + ] + stats = sorted(stats, key=lambda s: -s["Size"]) # Format numbers for i, s in enumerate(stats): - stats[i]['Count'] = f'{s["Count"]:,}' - kb = round(s["Size"]/1000.0) - mb = round(s["Size"]/1000000.0) - gb = round(s["Size"]/1000000000.0, 2) - size = f'{kb:,}KB' + stats[i]["Count"] = f'{s["Count"]:,}' + kb = round(s["Size"] / 1000.0) + mb = round(s["Size"] / 1000000.0) + gb = round(s["Size"] / 1000000000.0, 2) + size = f"{kb:,}KB" if gb > 1: - size = f'{gb:,}GB' + size = f"{gb:,}GB" elif mb > 1: - size = f'{mb:,}MB' - stats[i]['Size'] = size + size = f"{mb:,}MB" + stats[i]["Size"] = size + + return stats + + +def make_request( + url: str, + params={}, + headers={}, + max_attempts=10, + timeout=60, + pause=5, + locgov_json=False, + json=False, + is_blocked=False, + verbose=False, +): + """ + Params: + - url (str): URL to request. 
+    - params (dict): Dictionary of parameters to pass to requests.get(). Used mostly
+      for API requests.
+    - headers (dict): HTTP headers to pass with request, such as User-Agent.
+    - max_attempts (int): Maximum number of attempts before returning a general
+      error. Default = 10.
+    - timeout (int): Number of seconds before requests.get() times out.
+    - pause (int): Number of baseline seconds between requests. This will increase
+      on retries.
+    - locgov_json (bool): True means that the response is a loc.gov JSON record
+    - json (bool): True means that the response is a JSON record (this is
+      ignored if locgov_json = True)
+    - is_blocked (bool): True means that the server has already returned a 429.
+      This is for use in loops where you'd like to halt all requests in the
+      event of a 429 status code.
+
+    Returns:
+    - is_blocked (bool).
+    - result: JSON-like object if locgov_json or json are True; binary otherwise;
+      or one of the string error messages listed below.
+
+    Error handling:
+      Pause and retry:
+      - Network/DNS issue (requests.get() error)
+      - status_code 5##
+      - loc.gov JSON with 'status' 5##
+      - loc.gov JSON with 'status' 4## (1 retry only)
+      - status code 404 if locgov_json = True (1 retry only)
+      Returns `False, "ERROR - NO RECORD"`:
+      - status code 404
+      Returns `False, "ERROR - INVALID JSON"`:
+      - invalid loc.gov JSON
+      - invalid JSON
+      Returns `False, "ERROR - GENERAL"`:
+      - on final attempt, 5## status_code or loc.gov JSON 'status'
+      - on final attempt, requests.get() error occurs
+      - status code 403
+      Returns `True, "ERROR - BLOCKED"`:
+      - status_code 429 (too many requests, blocked by server)
+    """
+    i = 0
+    no_record = False, "ERROR - NO RECORD"  # value to return for URLs that don't exist
+    invalid = False, "ERROR - INVALID JSON"  # value to return for invalid JSON
+    error = False, "ERROR - GENERAL"  # value to return for all other errors
+    blocked = True, "ERROR - BLOCKED"  # value to return after receiving a 429
+    if is_blocked is True:
+        print(f"Blocked due to too many requests. Skipping {url} {params}")
+        return blocked
+    while i < max_attempts:
+        iter_pause = pause * (i + 1)
+        if i > 0:
+            if verbose is True:
+                print(f" Trying again in {iter_pause} seconds . . .")
+            time.sleep(iter_pause)
+        if verbose is True:
+            print(f"Making request. Attempt #{i+1} for: {url} {params}")
+
+        try:
+            response = requests.get(
+                url, params=params, timeout=timeout, headers=headers
+            )
+        except requests.exceptions.ConnectionError as e:
+            print(f"Connection error ({e}): {url} {params}.")
+            i += 1
+            continue
+        except Exception as e:
+            print(f"requests.get() failed ({e}): {url} {params}.")
+            i += 1
+            continue
+
+        # 429 too many requests
+        if response.status_code == 429:
+            print(f"Too many requests (429). Skipping: {url} {params}.")
+            return blocked
+
+        # 500 - 599 server error
+        elif 500 <= response.status_code < 600:
+            print(f"Server error ({response.status_code}): {url} {params}.")
+            i += 1
+            continue
+
+        # 403 forbidden
+        elif response.status_code == 403:
+            print(f"Forbidden request (403). Skipping: {url} {params}.")
+            return error
+
+        # 404 doesn't exist
+        elif response.status_code == 404:
+            if (i <= 2) and (locgov_json is True):
+                if verbose is True:
+                    print(
+                        f"Received 404 status_code (locgov_json is True, trying another time): {url} {params}."
+                    )
+                i += 1
+                continue
+            else:
+                print(f"Resource does not exist (404). Skipping: {url} {params}.")
+                return no_record
+
+        # Verify JSON (if applicable)
+        if locgov_json is True:
+            try:
+                output = response.json()
+                status = str(output.get("status"))
+
+                # loc.gov JSON 4##
+                if status.startswith("4"):
+                    if i > 2:  # only makes two attempts
+                        print(
+                            f"Resource does not exist (loc.gov {status}). Skipping: {url} {params}."
+                        )
+                        return no_record
+                    else:
+                        if verbose is True:
+                            print(f"Received loc.gov JSON 404: {url} {params}")
+                        i += 1
+                        continue
+
+                # loc.gov JSON 5##
+                elif status.startswith("5"):
+                    if verbose is True:
+                        print(f"Server error ({status}). Request for {url} {params}.")
+                    i += 1
+                    continue
+
+                # loc.gov valid JSON
+                else:
+                    # Success, return the loc.gov JSON record
+                    if verbose is True:
+                        print(f"Successful request (loc.gov JSON): {url} {params}")
+                    # Proceed to RETURN SUCCESSFUL RESULT
+
+            # loc.gov invalid JSON
+            except Exception as e:
+                print(f"INVALID JSON ({e}): {url} {params}")
+                return invalid
+
+        elif json is True:
+            try:
+                output = response.json()
+                if verbose is True:
+                    print(f"Successful request (JSON): {url} {params}")
+                # Proceed to RETURN SUCCESSFUL RESULT
+            except Exception as e:
+                print(f"INVALID JSON ({e}): {url} {params}")
+                return invalid
+
+        else:
+            output = response  # no JSON flag set, so return the raw response object
+            if verbose is True:
+                print(f"Successful request: {url} {params}")
+            # Proceed to RETURN SUCCESSFUL RESULT
+
+        # RETURN SUCCESSFUL RESULT
+        return False, output
 
-    return stats
\ No newline at end of file
+    # If max attempts are exhausted, return a general error
+    return error
diff --git a/Data Packages/requirements.txt b/Data Packages/requirements.txt
index 0179c7a..e8cd8c3 100644
--- a/Data Packages/requirements.txt
+++ b/Data Packages/requirements.txt
@@ -16,6 +16,9 @@ scipy
 # For printing data tables
 pandas
 
+# For data analysis
+scikit-learn
+
 # For reading and manipulating images
 Pillow
 
@@ -31,5 +34,8 @@ tqdm
 # For generating wordclouds
 wordcloud
 
+# For parsing XML and HTML
+beautifulsoup4
+
 # For reading, parsing PDFs
 pypdf
\ No newline at end of file
diff --git a/Data Packages/us-elections.ipynb b/Data Packages/us-elections.ipynb
new file mode 100644
index 0000000..55a17f6
--- /dev/null
+++ b/Data Packages/us-elections.ipynb
@@ -0,0 +1,2414 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# LoC Data Package Tutorial: United States Elections, Web Archives Data Package\n",
+    "\n",
+    "version 2.0\n",
+    "\n",
+    "This notebook will demonstrate basic usage of Python for interacting with [data packages from the Library of Congress](https://data.labs.loc.gov/packages/) via the [United States Elections, Web Archives Data Package](https://data.labs.loc.gov/us-elections/), which is derived from the Library's [United States Elections Web Archive](https://www.loc.gov/collections/united-states-elections-web-archive/). We will:\n",
+    "\n",
+    "1. [Output data package summary](#Output-data-package-summary)\n",
+    "2. [Query the metadata in the data package](#Query-the-metadata-in-the-data-package)\n",
+    "3. 
[Filter and download CDX index files, analyze text](#Filter-and-download-CDX-index-files,-analyze-text)\n", + "\n", + "## Prerequisites\n", + "\n", + "In order to run this notebook, please follow the instructions listed in [this directory's README](https://github.com/LibraryOfCongress/data-exploration/blob/master/Data%20Packages/README.md)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output data package summary\n", + "\n", + "First, we will select [United States Elections, Web Archives Data Package](https://data.labs.loc.gov/us-elections/) and output a summary of its files." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+    "[rendered HTML table omitted; see the text/plain output that follows]
" + ], + "text/plain": [ + " FileType Count Size\n", + "0 .gz 394,950 227.8GB" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ast # For reading structured data from metadata.csv\n", + "import pandas as pd # For reading, manipulating, and displaying data\n", + "import requests # For retrieving online files\n", + "import sys # For general system tasks\n", + "\n", + "from helpers import get_file_stats, make_request\n", + "\n", + "# Set general variables we'll use throughout\n", + "DATA_URL = 'https://data.labs.loc.gov/us-elections/' # Base URL of this data package\n", + "PYTHON_VERSION = sys.version.split('|')[0] # We will use this in our request headers\n", + "HEADERS = { # This allows us to declare ourselves to Library of Congress servers\n", + " 'User-Agent':f'https://github.com/LibraryOfCongress/data-exploration/blob/master/Data Packages/us-elections.ipynb : 2.0 (python : {PYTHON_VERSION})'\n", + " } \n", + "\n", + "# Download the file manifest\n", + "file_manifest_url = f'{DATA_URL}manifest.json'\n", + "is_blocked, response = make_request(file_manifest_url, json=True)\n", + "if response is None:\n", + " print(f'There was an error retrieving the manifest file at {DATA_URL}manifest.json')\n", + "files = [dict(zip(response[\"cols\"], row)) for row in response[\"rows\"]] # zip columns and rows\n", + "\n", + "# Convert to Pandas DataFrame and show stats table\n", + "stats = get_file_stats(files)\n", + "pd.DataFrame(stats)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query the metadata in the data package\n", + "\n", + "Next we will download this data package's `metadata.csv` file, print a summary of various values, and demonstrate filtering options.\n", + "\n", + "The `metadata.csv` file lists all of the US election political candidates websites that have been collected as part of the [United States Elections Web Archive](https://www.loc.gov/collections/united-states-elections-web-archive/) and which are expected to be indexed in this data package's CDX index files. To read more about this data package's scope, see its `README`. \n", + "\n", + "Because the CDX index files are a mixed bag of additional content, the `metadata.csv` file can be used to target content just from political candidate website domains. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded metadata file with 13,388 entries.\n" + ] + } + ], + "source": [ + "metadata_url = f'{DATA_URL}metadata.json'\n", + "is_blocked, response = make_request(metadata_url, headers=HEADERS)\n", + "data = response.json()\n", + "\n", + "metadata_df = pd.DataFrame(data)\n", + "\n", + "print(f'Loaded metadata file with {len(metadata_df):,} entries.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next let's convert to pandas DataFrame and print the available properties." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "item_id, item_title, website_url, website_id, website_scopes, collection, website_elections, website_parties, website_places, website_districts, website_thumbnail, website_start_date, website_end_date, item_all_years, website_all_years, mods_url, access_condition\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(', '.join(metadata_df.columns.to_list()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's check the campaign years represented in `metadata.csv`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['2000', '2002', '2004', '2006', '2008', '2010', '2012', '2014', '2016']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "collections = metadata_df['collection'].dropna().unique()\n",
+    "years = [collection.split(', ')[1] for collection in collections]\n",
+    "years.sort()\n",
+    "years"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Interpreting the metadata fields\n",
+    "\n",
+    "The fields are defined in this package's `README`. Each row is a particular website collected for a specific candidate in a single election year.\n",
+    "\n",
+    "Let's look at an example row to understand how to interpret the fields. We'll write out a paragraph describing our example row. We'll look at row #`3460` (which we happen to know represents the only candidate in `metadata.csv` to have campaigned in two races in the same year under different parties):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First, let's make sure that our dataframe columns with lists are interpreted correctly.\n",
+    "metadata_df['website_elections'] = metadata_df['website_elections'].apply(ast.literal_eval)\n",
+    "metadata_df['website_parties'] = metadata_df['website_parties'].apply(ast.literal_eval)\n",
+    "metadata_df['website_places'] = metadata_df['website_places'].apply(ast.literal_eval)\n",
+    "metadata_df['website_districts'] = metadata_df['website_districts'].apply(ast.literal_eval)\n",
+    "metadata_df['item_all_years'] = metadata_df['item_all_years'].apply(ast.literal_eval)\n",
+    "metadata_df['website_all_years'] = metadata_df['website_all_years'].apply(ast.literal_eval)\n",
+    "metadata_df['website_scopes'] = metadata_df['website_scopes'].apply(ast.literal_eval)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Record #3460 in the metadata.csv is: http://www.usmjp.com/, from the collection \"United States Elections, 2012\".\n",
+      "This row represents the website in 2012, used for campaign(s) of the candidate: Cris Ericson.\n",
+      "In 2012, this candidate used this website in 2 campaign(s):\n",
+      " 0. United States. Congress. Senate | U.S. Marijuana Party | Vermont | \n",
+      " 1. Vermont. 
Governor | Independent candidates | Vermont | \n", + "In total, this and possibly other websites were collected for this candidate in the following year(s): [2018, 2002, 2004, 2006, 2008, 2010, 2012]\n", + "The loc.gov item record for Cris Ericson campaign sites can be viewed at http://www.loc.gov/item/lcwaN0002501/, and its MODS record can be viewed at https://tile.loc.gov/storage-services/service/webcapture/project_1/mods/united-states-elections-web-archive/lcwaN0002501.xml.\n", + "Here is how this row appears in `metadata.csv`:\n" + ] + }, + { + "data": { + "text/html": [ + "
+    "[rendered HTML table omitted; see the text/plain output that follows]
" + ], + "text/plain": [ + " item_id \\\n", + "3460 http://www.loc.gov/item/lcwaN0002501/ \n", + "\n", + " item_title website_url \\\n", + "3460 Official Campaign Web Site - Cris Ericson http://www.usmjp.com/ \n", + "\n", + " website_id \\\n", + "3460 3415 \n", + "\n", + " website_scopes \\\n", + "3460 [http://crisericson.com, http://vermontnews.livejournal.com, http://www.myspace.com/usmjp2010, http://crisericson2010.blogspot.com] \n", + "\n", + " collection \\\n", + "3460 United States Elections, 2012 \n", + "\n", + " website_elections \\\n", + "3460 [United States. Congress. Senate, Vermont. Governor] \n", + "\n", + " website_parties website_places \\\n", + "3460 [U.S. Marijuana Party, Independent candidates] [Vermont, Vermont] \n", + "\n", + " website_districts \\\n", + "3460 [None, None] \n", + "\n", + " website_thumbnail \\\n", + "3460 http://cdn.loc.gov/service/webcapture/project_1/thumbnails/lcwaS0003415.jpg \n", + "\n", + " website_start_date website_end_date \\\n", + "3460 20121003 20121019 \n", + "\n", + " item_all_years \\\n", + "3460 [2002, 2004, 2004, 2006, 2008, 2010, 2012, 2012, 2018, 2018] \n", + "\n", + " website_all_years \\\n", + "3460 [2012] \n", + "\n", + " mods_url \\\n", + "3460 https://tile.loc.gov/storage-services/service/webcapture/project_1/mods/united-states-elections-web-archive/lcwaN0002501.xml \n", + "\n", + " access_condition \n", + "3460 None " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row = 3460 # You can change this row number\n", + "\n", + "# We'll grab all the info we need from our row. \n", + "item_title = metadata_df.iloc[row]['item_title']\n", + "website_url = metadata_df.iloc[row]['website_url']\n", + "collection = metadata_df.iloc[row]['collection']\n", + "candidate_name = item_title.split('-')[1].strip()\n", + "year = collection.split(',')[1].strip()\n", + "campaign_count = len(metadata_df.iloc[row]['website_elections'])\n", + "website_elections = metadata_df.iloc[row]['website_elections']\n", + "website_parties = metadata_df.iloc[row]['website_parties']\n", + "website_places = metadata_df.iloc[row]['website_places']\n", + "website_districts = metadata_df.iloc[row]['website_districts']\n", + "website_all_years = metadata_df.iloc[row]['website_all_years']\n", + "website_all_years.sort()\n", + "item_all_years = metadata_df.iloc[row]['item_all_years']\n", + "item_all_years.sort()\n", + "item_id = metadata_df.iloc[row]['item_id']\n", + "mods_url = metadata_df.iloc[row]['mods_url']\n", + "\n", + "# Now we'll plug those variables into our sentences.\n", + "print(f'Record #{row} in the metadata.csv is: {website_url}, from the collection \"{collection}\".')\n", + "print(f'This row represents the website in {year}, used for campaign(s) of the candidate: {candidate_name}.') \n", + "print(f'In {year}, this candidate used this website in {campaign_count} campaign(s):')\n", + "i=0\n", + "while i < campaign_count:\n", + " if website_districts[i] is None:\n", + " house_district = '' \n", + " else:\n", + " house_district = website_districts[i]\n", + " print(f' {i}. 
{website_elections[i]} | {website_parties[i]} | {website_places[i]} | {house_district}')\n",
+    "    i += 1\n",
+    "if len(website_all_years) > 1:\n",
+    "    print(f'This website ({website_url}) was also used for these other campaign year(s) for {candidate_name}: {list(set(website_all_years) - set([year]))}')\n",
+    "print(f'In total, this and possibly other websites were collected for this candidate in the following year(s): {list(set(item_all_years))}')\n",
+    "print(f'The loc.gov item record for {candidate_name} campaign sites can be viewed at {item_id}, and its MODS record can be viewed at {mods_url}.')\n",
+    "\n",
+    "# The next line displays our dataframe as a table. Let's set it to show up to 300 characters in each cell\n",
+    "pd.options.display.max_colwidth = 300\n",
+    "\n",
+    "print('Here is how this row appears in `metadata.csv`:')\n",
+    "metadata_df[row:row+1]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's look at all the Vermont gubernatorial candidates represented in this data package."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found in metadata.csv: 0 unique campaign websites for 0 \"Vermont. Governor\" candidates, ranging from years n/a - n/a.\n",
+      "0 of these websites were used multiple years.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# We'll create a function to generate summary information about a given type of election\n",
+    "\n",
+    "def election_summary(election_type):\n",
+    "    websites_by_year = metadata_df[metadata_df['website_elections'].apply(lambda elections: any(election_type == election for election in elections))]\n",
+    "    candidates = websites_by_year['item_title'].unique()\n",
+    "    websites = websites_by_year['website_url'].unique()\n",
+    "    years = [collection.split(',')[1].strip() for collection in websites_by_year['collection'].unique()]\n",
+    "    min_year = min(years) if years else 'n/a'\n",
+    "    max_year = max(years) if years else 'n/a'\n",
+    "    multi_year_websites = websites_by_year[websites_by_year['website_all_years'].str.len() > 1]['website_url'].unique()\n",
+    "    print(f'Found in metadata.csv: {len(websites)} unique campaign websites for {len(candidates)} \"{election_type}\" candidates, ranging from years {min_year} - {max_year}.')\n",
+    "    print(f'{len(multi_year_websites)} of these websites were used multiple years.')\n",
+    "\n",
+    "election_summary('Vermont. Governor')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Off-year elections aren't represented in this data package even though they are in the [United States Elections Web Archive](https://www.loc.gov/collections/united-states-elections-web-archive/) online collection. This is due to the way that content is organized in CDX files.\n",
+    "\n",
+    "For example, Virginia's gubernatorial elections are off-year elections (in odd-numbered years), and thus are not represented in this data package [even though they are in the online collection](https://www.loc.gov/collections/united-states-elections-web-archive/?fa=subject:governor%7Csubject:virginia).\n",
+    "\n",
+    "After you run the next cell, try replacing \"Virginia. Governor\" with something like \"United States. Congress. Senate\", \"United States. President\", or \"Michigan. Governor\".",
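+    "\n",
+    "To see which election types are available to try, one quick sketch is to flatten the `website_elections` lists and take the unique values:\n",
+    "\n",
+    "```python\n",
+    "sorted({e for lst in metadata_df['website_elections'] for e in lst if e})\n",
+    "```"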
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found in metadata.csv: 0 unique campaign websites for 0 \"Virginia. Governor\" candidates, ranging from years n/a - n/a.\n",
+      "0 of these websites were used multiple years.\n"
+     ]
+    }
+   ],
+   "source": [
+    "election_summary('Virginia. Governor')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Filter and download CDX index files, analyze text\n",
+    "\n",
+    "The bulk of this data package consists of CDX files. In this section, we'll retrieve a small sample of those CDX files and analyze the text inside them.\n",
+    "\n",
+    "Here we will define the functions in the order that they are used in this section of the notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup  # Used to process the scraped content\n",
+    "import gzip  # Used to decompress the gzipped CDX files\n",
+    "from sklearn.feature_extraction.text import CountVectorizer  # Used to create a matrix out of a bag of words\n",
+    "from time import sleep  # Used to provide a slight pause between requests\n",
+    "\n",
+    "\n",
+    "WAYBACK_BASE_URL = 'https://webarchive.loc.gov/all/'\n",
+    "WAYBACK_LEGACY_BASE_URL = 'https://webarchive.loc.gov/legacy/'\n",
+    "\n",
+    "def gather_files_from_manifest(year: str):\n",
+    "    \"\"\"\n",
+    "    Function that takes a year (YYYY) as an argument.\n",
+    "    The function collects the locations of the CDX files\n",
+    "    listed by the provided year's manifest.\n",
+    "\n",
+    "    Args:\n",
+    "        year (str): String of a year YYYY.\n",
+    "\n",
+    "    Returns:\n",
+    "        :obj:`list` of :obj:`str` of individual CDX file URLs. In case\n",
+    "        of error, returns an empty list.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    election_years = [\n",
+    "        \"2000\",\n",
+    "        \"2002\",\n",
+    "        \"2004\",\n",
+    "        \"2006\",\n",
+    "        \"2008\",\n",
+    "        \"2010\",\n",
+    "        \"2012\",\n",
+    "        \"2014\",\n",
+    "        \"2016\"\n",
+    "    ]\n",
+    "\n",
+    "    if year not in election_years:\n",
+    "        return []\n",
+    "    else:\n",
+    "        try:\n",
+    "            manifest_url = f\"{DATA_URL}by-year/{year}/manifest.html\"\n",
+    "            is_blocked, response = make_request(manifest_url)\n",
+    "            soup = BeautifulSoup(response.content, 'html.parser')\n",
+    "            cdx_files = [link.get('href') for link in soup.find_all('a')]\n",
+    "            return cdx_files\n",
+    "        except Exception:\n",
+    "            print(f'There was an error retrieving and/or parsing {manifest_url}.')\n",
+    "            return []\n",
+    "\n",
+    "\n",
+    "def fetch_file(cdx_url: str):\n",
+    "    \"\"\"\n",
+    "    Function that takes a `String` as an argument.\n",
+    "    The `cdx_url` is a singular item from the result\n",
+    "    of the `gather_files_from_manifest` function.\n",
+    "    The function fetches the gzipped CDX file, decompresses it,\n",
+    "    splits it on the newlines, and removes the header.\n",
+    "\n",
+    "    Args:\n",
+    "        cdx_url (str): Individual item from the result of\n",
+    "            the `gather_files_from_manifest` function.\n",
+    "\n",
+    "    Returns:\n",
+    "        :obj:`list` of :obj:`str` of individual CDX lines, each representing\n",
+    "        a web object. Returns an empty list in case of errors.\n",
+    "    \"\"\"\n",
+    "    # Get the CDX file. For a production script, you'll want to build in additional error handling.\n",
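+    "    # (As a sketch, stricter handling might bound the wait and raise on HTTP\n",
+    "    #  errors, using standard requests features:\n",
+    "    #      response = requests.get(cdx_url, timeout=60)\n",
+    "    #      response.raise_for_status()\n",
+    "    #  Here we keep the simpler version below.)\n",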
+    "    try:\n",
+    "        response = requests.get(cdx_url)\n",
+    "    except Exception:\n",
+    "        response = None\n",
+    "\n",
+    "    # Here we decompress the gzipped CDX, decode it, split it on the newline, and remove the header\n",
+    "    try:\n",
+    "        cdx_content = gzip.decompress(response.content).decode('utf-8').split('\\n')[1:]\n",
+    "        return cdx_content\n",
+    "    except Exception:\n",
+    "        print(f'There was an error decompressing or parsing the CDX file: {cdx_url}. This file will be skipped.')\n",
+    "        return []\n",
+    "\n",
+    "\n",
+    "def create_dataframe(data: list):\n",
+    "    \"\"\"\n",
+    "    Function that takes a :obj:`list` of :obj:`str` as an argument.\n",
+    "    `data` is the contents of the CDX file split on newlines.\n",
+    "    This function takes `data`, applies a schema to it, and transforms it\n",
+    "    into a `pandas.DataFrame`.\n",
+    "\n",
+    "    Args:\n",
+    "        data (list): :obj:`list` of :obj:`str`. Each item is a line from\n",
+    "            a CDX file or group of files.\n",
+    "\n",
+    "    Returns:\n",
+    "        A `pandas.DataFrame` of a CDX file or group of files. In case of error,\n",
+    "        a blank pandas.DataFrame is returned.\n",
+    "    \"\"\"\n",
+    "    schema = [\n",
+    "        'urlkey',\n",
+    "        'timestamp',\n",
+    "        'original',\n",
+    "        'mimetype',\n",
+    "        'statuscode',\n",
+    "        'digest',\n",
+    "        'redirect',\n",
+    "        'metatags',\n",
+    "        'file_size',\n",
+    "        'offset',\n",
+    "        'warc_filename'\n",
+    "    ]\n",
+    "    try:\n",
+    "        _data = [row.split() for row in data]\n",
+    "        df = pd.DataFrame(_data, columns=schema)\n",
+    "        return df\n",
+    "    except Exception:\n",
+    "        print('There was an error converting the data into a dataframe. Returning a blank dataframe.')\n",
+    "        return pd.DataFrame()\n",
+    "\n",
+    "def create_dataframe_from_manifest(manifest: list):\n",
+    "    \"\"\"\n",
+    "    Function that takes a :obj:`list` of :obj:`str` as an argument.\n",
+    "    The `manifest` is a list of all the individual CDX files found\n",
+    "    from an Election year's or group of Election years' HTML manifest.\n",
+    "    This function loops through each file, transforms it into a `pandas.DataFrame`\n",
+    "    by calling the `create_dataframe` function, concats the DataFrames together,\n",
+    "    and then returns the DataFrame representing the entire manifest.\n",
+    "\n",
+    "    Args:\n",
+    "        manifest (list): :obj:`list` of :obj:`str` of all the individual CDX files found\n",
+    "            from an Election year's or group of Election years' HTML manifest.\n",
+    "\n",
+    "    Returns:\n",
+    "        `pandas.DataFrame` representing every file present in the `manifest`.\n",
+    "    \"\"\"\n",
+    "    df = pd.DataFrame()\n",
+    "    for cdx_url in manifest:\n",
+    "        cdx = fetch_file(cdx_url)\n",
+    "        if len(cdx) == 0:\n",
+    "            continue\n",
+    "        try:\n",
+    "            new_rows = create_dataframe(cdx)\n",
+    "            df = pd.concat([df, new_rows])\n",
+    "        except Exception:\n",
+    "            print(f'There was an error converting {cdx_url} to a dataframe. This may be due to a malformed CDX file. This data will be skipped.')\n",
+    "    return df\n",
+    "\n",
+    "def fetch_text(row: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Function that takes a `pandas.Series`, which is a single row\n",
+    "    from a `pandas.DataFrame`, as an argument.\n",
+    "    The function uses the timestamp and original fields from the `row`\n",
+    "    to request the specific resource from OpenWayback. Once the resource is\n",
+    "    fetched, the Wayback banner div elements are removed so as to not detract\n",
+    "    from the words in the resource itself.\n",
+    "\n",
+    "    Args:\n",
+    "        row (pandas.Series): `pandas.Series`, which is a single row\n",
+    "            from a `pandas.DataFrame`.\n",
+    "\n",
+    "    Returns:\n",
+    "        `String` of the resource's text. If an error is encountered, returns\n",
+    "        an empty string.\n",
+    "    \"\"\"\n",
+    "    playback_url = row['original']\n",
+    "    if (row['timestamp'] is None) or (row['timestamp'] == ''):\n",
+    "        print(f'CDX row is missing timestamp. Not retrieving text for {playback_url}')\n",
+    "        return ''\n",
+    "    timestamp = row['timestamp']\n",
+    "    if timestamp.startswith('2000'):\n",
+    "        base_url = WAYBACK_LEGACY_BASE_URL\n",
+    "    else:\n",
+    "        base_url = WAYBACK_BASE_URL\n",
+    "    is_blocked, response = make_request(f\"{base_url}{timestamp}/{playback_url}\", pause=15)\n",
+    "    if is_blocked is True:\n",
+    "        print(f'429 too many requests. Skipping: {base_url}{timestamp}/{playback_url}')\n",
+    "        return 429\n",
+    "    if isinstance(response, str):  # make_request returns an error string on failure\n",
+    "        print(f'Error retrieving {base_url}{timestamp}/{playback_url}. Skipping full text for this document.')\n",
+    "        return ''\n",
+    "    try:\n",
+    "        soup = BeautifulSoup(response.text, 'html.parser')\n",
+    "        [el.extract() for el in soup.find_all('div', {'id': 'wm-maximized'})]\n",
+    "        [el.extract() for el in soup.find_all('div', {'id': 'wm-minimized'})]\n",
+    "        return soup.text\n",
+    "    except Exception:\n",
+    "        print(f'Error parsing full text from {base_url}{timestamp}/{playback_url}. Skipping full text for this document.')\n",
+    "        return ''\n",
+    "\n",
+    "def fetch_all_text(df: pd.DataFrame):\n",
+    "    \"\"\"\n",
+    "    Function that takes a `pandas.DataFrame` as an argument.\n",
+    "    This is the most complicated function here. The function first cleans the\n",
+    "    `df` that was passed in by dropping all the rows that do not have a value in the\n",
+    "    mimetype field. Then, it drops all the duplicate digests, which removes resources\n",
+    "    that are exactly the same. Finally, it only returns rows that have 'text' in the\n",
+    "    mimetype field and have a '200' or '-' HTTP status response.\n",
+    "    Once the `df` is cleaned, each resource's text is fetched from the Wayback,\n",
+    "    transformed into a matrix using `sklearn.CountVectorizer`, and then returned as a `pandas.DataFrame`\n",
+    "    with words and their occurrence per resource. A politeness pause of 15 seconds is added between Wayback requests.\n",
+    "\n",
+    "    Args:\n",
+    "        df (pandas.DataFrame): `pandas.DataFrame` representing web resources as CDX lines.\n",
+    "\n",
+    "    Returns:\n",
+    "        `pandas.DataFrame` of the resource's words tabulated per web resource.\n",
+    "    \"\"\"\n",
+    "    countvec = CountVectorizer(ngram_range=(1, 1), stop_words='english')\n",
+    "    unprocessed_bag_of_words = []\n",
+    "    text_df = df\\\n",
+    "        .dropna(subset=['mimetype'])\\\n",
+    "        .drop_duplicates(subset=['digest'])\\\n",
+    "        .query(\n",
+    "            '(statuscode.str.match(\"200\") or statuscode.str.match(\"-\")) and mimetype.str.contains(\"text\")',\n",
+    "            engine='python'\n",
+    "        )\n",
+    "    for i, row in text_df.iterrows():\n",
+    "        fetched_text = fetch_text(row)\n",
+    "        if fetched_text == 429:\n",
+    "            print('Halting requests for web archives. 
Received a 429 error from the server, which means too many requests too quickly.')\n", + " break\n", + " unprocessed_bag_of_words.append(fetched_text)\n", + " \n", + " \n", + " processed_bag_of_words = countvec.fit_transform(unprocessed_bag_of_words)\n", + " \n", + " return pd.DataFrame(processed_bag_of_words.toarray(),columns=countvec.get_feature_names_out())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Gathering the list of CDX Files\n", + "\n", + "The first step is gathering the list of CDX files. To do that, simply call the `gather_files_from_manifest` function, providing the Election year as an argument." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "el00_files = gather_files_from_manifest('2000')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at our first five files:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://data.labs.loc.gov/us-elections/by-year/2000/cdx/unique.20010415093936.surt.cdx.gz',\n", + " 'https://data.labs.loc.gov/us-elections/by-year/2000/cdx/unique.20010415094743.surt.cdx.gz',\n", + " 'https://data.labs.loc.gov/us-elections/by-year/2000/cdx/unique.20010415095044.surt.cdx.gz',\n", + " 'https://data.labs.loc.gov/us-elections/by-year/2000/cdx/unique.20010415095244.surt.cdx.gz',\n", + " 'https://data.labs.loc.gov/us-elections/by-year/2000/cdx/unique.20010415095459.surt.cdx.gz']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "el00_files[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect a sample CDX File\n", + "\n", + "Next, we'll demonstrate what a particular CDX File looks like. We'll look at the first five lines of our first CDX from 2000. 
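\n",
+    "Each line in a CDX file is a single space-delimited record. As a minimal sketch (using the `cdx` list fetched in the next cell and the same 11-field schema that `create_dataframe` applies), one line can be unpacked by hand:\n",
+    "\n",
+    "```python\n",
+    "fields = cdx[0].split()  # urlkey, timestamp, original, mimetype, statuscode, digest, ...\n",
+    "print(dict(zip(['urlkey', 'timestamp', 'original', 'mimetype', 'statuscode'], fields[:5])))\n",
+    "```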
" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "cdx = fetch_file(el00_files[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['com,voter)/home/candidates/info/0,1214,2-11880-,00.html 20001002182124 http://www.voter.com:80/home/candidates/info/0,1214,2-11880-,00.html text/html 200 FYXP43MQC5GVBQMVK3ETWSPXUBR5ICKP - - 5051 149 unique.20010415093936.arc.gz',\n", + " 'com,voter)/home/candidates/info/0,1214,2-18885-,00.html 20001002185814 http://www.voter.com:80/home/candidates/info/0,1214,2-18885-,00.html text/html 200 H6QN5ZULJ6YZP756QNVM3YXKXC7HZUIL - - 4829 5200 unique.20010415093936.arc.gz',\n", + " 'com,voter)/home/candidates/info/0,1214,2-18880-,00.html 20001002185815 http://www.voter.com:80/home/candidates/info/0,1214,2-18880-,00.html text/html 200 HFG67JI4KBPHFXMQE5DJRHF3OEKKBOO6 - - 4794 10029 unique.20010415093936.arc.gz',\n", + " 'com,voter)/home/officials/general/1,1195,2-2467-,00.html 20001002185815 http://voter.com:80/home/officials/general/1,1195,2-2467-,00.html text/html 200 HZJFLTHZD5MGEPJS2WVGBHQRQUPFBE3O - - 5282 14823 unique.20010415093936.arc.gz',\n", + " 'com,voter)/home/candidates/info/0,1214,2-18886-,00.html 20001002185816 http://www.voter.com:80/home/candidates/info/0,1214,2-18886-,00.html text/html 200 QAM7JW7S4CNYMP6HLA6DASOXTO2SIGWO - - 4823 20105 unique.20010415093936.arc.gz']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cdx[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, here is the same CDX transformed into a DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
+    "[rendered HTML table omitted (1096880 rows × 11 columns); see the text/plain preview that follows]
" + ], + "text/plain": [ + " urlkey \\\n", + "0 com,voter)/home/candidates/info/0,1214,2-11880-,00.html \n", + "1 com,voter)/home/candidates/info/0,1214,2-18885-,00.html \n", + "2 com,voter)/home/candidates/info/0,1214,2-18880-,00.html \n", + "3 com,voter)/home/officials/general/1,1195,2-2467-,00.html \n", + "4 com,voter)/home/candidates/info/0,1214,2-18886-,00.html \n", + "... ... \n", + "1096875 com,voter)/home/candidates/info/0,1214,2-9118-,00.html \n", + "1096876 com,voter)/home/candidates/info/0,1214,2-9115-,00.html \n", + "1096877 com,voter)/home/candidates/info/0,1214,2-15361-,00.html \n", + "1096878 com,voter)/home/candidates/info/0,1214,2-12994-,00.html \n", + "1096879 None \n", + "\n", + " timestamp \\\n", + "0 20001002182124 \n", + "1 20001002185814 \n", + "2 20001002185815 \n", + "3 20001002185815 \n", + "4 20001002185816 \n", + "... ... \n", + "1096875 20001002183052 \n", + "1096876 20001002183052 \n", + "1096877 20001002182249 \n", + "1096878 20001002181842 \n", + "1096879 None \n", + "\n", + " original \\\n", + "0 http://www.voter.com:80/home/candidates/info/0,1214,2-11880-,00.html \n", + "1 http://www.voter.com:80/home/candidates/info/0,1214,2-18885-,00.html \n", + "2 http://www.voter.com:80/home/candidates/info/0,1214,2-18880-,00.html \n", + "3 http://voter.com:80/home/officials/general/1,1195,2-2467-,00.html \n", + "4 http://www.voter.com:80/home/candidates/info/0,1214,2-18886-,00.html \n", + "... ... \n", + "1096875 http://www.voter.com:80/home/candidates/info/0,1214,2-9118-,00.html \n", + "1096876 http://www.voter.com:80/home/candidates/info/0,1214,2-9115-,00.html \n", + "1096877 http://www.voter.com:80/home/candidates/info/0,1214,2-15361-,00.html \n", + "1096878 http://www.voter.com:80/home/candidates/info/0,1214,2-12994-,00.html \n", + "1096879 None \n", + "\n", + " mimetype statuscode digest redirect \\\n", + "0 text/html 200 FYXP43MQC5GVBQMVK3ETWSPXUBR5ICKP - \n", + "1 text/html 200 H6QN5ZULJ6YZP756QNVM3YXKXC7HZUIL - \n", + "2 text/html 200 HFG67JI4KBPHFXMQE5DJRHF3OEKKBOO6 - \n", + "3 text/html 200 HZJFLTHZD5MGEPJS2WVGBHQRQUPFBE3O - \n", + "4 text/html 200 QAM7JW7S4CNYMP6HLA6DASOXTO2SIGWO - \n", + "... ... ... ... ... \n", + "1096875 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "1096876 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "1096877 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "1096878 text/html 404 UDSH36NBYWO2X73LNMX2LEHLNQ7FYXHZ - \n", + "1096879 None None None None \n", + "\n", + " metatags file_size offset warc_filename \n", + "0 - 5051 149 unique.20010415093936.arc.gz \n", + "1 - 4829 5200 unique.20010415093936.arc.gz \n", + "2 - 4794 10029 unique.20010415093936.arc.gz \n", + "3 - 5282 14823 unique.20010415093936.arc.gz \n", + "4 - 4823 20105 unique.20010415093936.arc.gz \n", + "... ... ... ... ... \n", + "1096875 - 118 145323588 unique.20010415093936.arc.gz \n", + "1096876 - 118 145323706 unique.20010415093936.arc.gz \n", + "1096877 - 119 145323824 unique.20010415093936.arc.gz \n", + "1096878 - 351 145323943 unique.20010415093936.arc.gz \n", + "1096879 None None None None \n", + "\n", + "[1096880 rows x 11 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cdx_df = create_dataframe(cdx)\n", + "cdx_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Election 2000 DataFrame\n", + "\n", + "Now we'll create a DataFrame from the first fifteen CDX files in the 2000 election subset. 
To do that, we'll use the `create_dataframe_from_manifest` function, which loops over the CDX files and calls `create_dataframe` programmatically, instead of manually and individually as we did above.\n",
+    "\n",
+    "If we had more time or were working on a more powerful computer, we'd pull from all of the files in the 2000 subset, but for now we'll just pull from the first fifteen."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "el00_df = create_dataframe_from_manifest(el00_files[0:15])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+    "[rendered HTML table omitted (1541579 rows × 11 columns); see the text/plain preview that follows]
" + ], + "text/plain": [ + " urlkey \\\n", + "0 com,voter)/home/candidates/info/0,1214,2-11880-,00.html \n", + "1 com,voter)/home/candidates/info/0,1214,2-18885-,00.html \n", + "2 com,voter)/home/candidates/info/0,1214,2-18880-,00.html \n", + "3 com,voter)/home/officials/general/1,1195,2-2467-,00.html \n", + "4 com,voter)/home/candidates/info/0,1214,2-18886-,00.html \n", + "... ... \n", + "338148 org,ctgop)/county/tolland.htm \n", + "338149 org,ctgop)/county/tolland.htm \n", + "338150 org,ctgop)/county/tolland.htm \n", + "338151 org,ctgop)/county/tolland.htm \n", + "338152 None \n", + "\n", + " timestamp \\\n", + "0 20001002182124 \n", + "1 20001002185814 \n", + "2 20001002185815 \n", + "3 20001002185815 \n", + "4 20001002185816 \n", + "... ... \n", + "338148 20001006073643 \n", + "338149 20001005073549 \n", + "338150 20001004073505 \n", + "338151 20001003073437 \n", + "338152 None \n", + "\n", + " original \\\n", + "0 http://www.voter.com:80/home/candidates/info/0,1214,2-11880-,00.html \n", + "1 http://www.voter.com:80/home/candidates/info/0,1214,2-18885-,00.html \n", + "2 http://www.voter.com:80/home/candidates/info/0,1214,2-18880-,00.html \n", + "3 http://voter.com:80/home/officials/general/1,1195,2-2467-,00.html \n", + "4 http://www.voter.com:80/home/candidates/info/0,1214,2-18886-,00.html \n", + "... ... \n", + "338148 http://www.ctgop.org:80/county/tolland.htm \n", + "338149 http://www.ctgop.org:80/county/tolland.htm \n", + "338150 http://www.ctgop.org:80/county/tolland.htm \n", + "338151 http://www.ctgop.org:80/county/tolland.htm \n", + "338152 None \n", + "\n", + " mimetype statuscode digest redirect \\\n", + "0 text/html 200 FYXP43MQC5GVBQMVK3ETWSPXUBR5ICKP - \n", + "1 text/html 200 H6QN5ZULJ6YZP756QNVM3YXKXC7HZUIL - \n", + "2 text/html 200 HFG67JI4KBPHFXMQE5DJRHF3OEKKBOO6 - \n", + "3 text/html 200 HZJFLTHZD5MGEPJS2WVGBHQRQUPFBE3O - \n", + "4 text/html 200 QAM7JW7S4CNYMP6HLA6DASOXTO2SIGWO - \n", + "... ... ... ... ... \n", + "338148 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "338149 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "338150 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "338151 text/html 200 TIRWMHRDJ5L22TJWCXVA6TNU5YOB65SW - \n", + "338152 None None None None \n", + "\n", + " metatags file_size offset warc_filename \n", + "0 - 5051 149 unique.20010415093936.arc.gz \n", + "1 - 4829 5200 unique.20010415093936.arc.gz \n", + "2 - 4794 10029 unique.20010415093936.arc.gz \n", + "3 - 5282 14823 unique.20010415093936.arc.gz \n", + "4 - 4823 20105 unique.20010415093936.arc.gz \n", + "... ... ... ... ... \n", + "338148 - 101 79251104 unique.20010415101811.arc.gz \n", + "338149 - 101 79251205 unique.20010415101811.arc.gz \n", + "338150 - 101 79251306 unique.20010415101811.arc.gz \n", + "338151 - 1421 79251407 unique.20010415101811.arc.gz \n", + "338152 None None None None \n", + "\n", + "[1541579 rows x 11 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "el00_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mimetypes\n", + "\n", + "For this exercise, we're going to take a brief look at the mimetypes. First, we'll select all the mimetypes in the Dataframe and get their sums by calling `value_counts` which is a method from Pandas." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "el00_mimetypes = el00_df['mimetype'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mimetype\n", + "- 1493256\n", + "text/html 43969\n", + "image/jpeg 2756\n", + "image/gif 1311\n", + "application/pdf 122\n", + "text/plain 59\n", + "image/bmp 28\n", + "audio/x-pn-realaudio 18\n", + "application/msword 11\n", + "text/css 4\n", + "image/png 4\n", + "application/octet-stream 3\n", + "application/x-javascript 3\n", + "video/quicktime 3\n", + "application/zip 2\n", + "audio/x-wav 2\n", + "audio/midi 2\n", + "text/xml 2\n", + "application/mac-binhex40 1\n", + "audio/x-aiff 1\n", + "image/tiff 1\n", + "application/x-tar 1\n", + "application/x-pointplus 1\n", + "audio/x-midi 1\n", + "video/x-msvideo 1\n", + "audio/basic 1\n", + "audio/x-mpeg 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "el00_mimetypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering by domain\n", + "\n", + "Let's now look at the domains and subdomains represented in the 2000 CDX files. We'll ignore the \"www\" part of URLs, but otherwise retain subdomains. " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "voter.com\n", + "whitehouse.gov\n", + "hayes.voter.com\n", + "freespeech.org\n", + "cnn.com\n", + "freedomchannel.com\n", + "essential.org\n", + "fsudemocrats.org\n", + "commoncause.org\n", + "democrats.org\n", + "uspolitics.about.com\n", + "enterstageright.com\n", + "reason.com\n", + "usconservatives.about.com\n", + "idaho-democrats.org\n", + "usliberals.about.com\n", + "www10.nytimes.com\n", + "election.voter.com\n", + "graphics.nytimes.com\n", + "nydems.org\n", + "adams.voter.com\n", + "mockelection.com\n", + "rpk.org\n", + "dnet.org\n", + "commonconservative.com\n", + "beavoter.org\n", + "beavoter.com\n", + "iowademocrats.org\n", + "forums.about.com\n", + "thiselection.com\n", + "indiaelection.com\n", + "server1.dscc.org\n", + "search.algore2000.com\n", + "forums.nytimes.com\n", + "azlp.org\n", + "intellectualcapital.com\n", + "prospect.org\n", + "grassroots.com\n", + "rnc.org\n", + "lwv.org\n", + "mn-politics.com\n", + "newwestpolitics.com\n", + "popandpolitics.com\n", + "washingtonpost.com\n", + "nacdnet.org\n", + "lp.org\n", + "algore2000.com\n", + "crlp.org\n", + "harrybrowne2000.org\n", + "ga.lp.org\n", + "emilyslist.org\n", + "ncgop.org\n", + "arkdems.org\n", + "cbdnet.org\n", + "keyes-grassroots.com\n", + "faqvoter.com\n", + "americanprospect.org\n", + "partners.nytimes.com\n", + "indems.org\n", + "ageofreason.com\n", + "vanishingvoter.org\n", + "nyc.dnet.org\n", + "robots.cnn.com\n", + "informedvoter.com\n", + "virginiapolitics.com\n", + "newpolitics.com\n", + "nan\n", + "md.lp.org\n", + "ca-dem.org\n", + "beachdemocrats.org\n", + "ohiodems.org\n", + "maryland.reformparty.org\n", + "muscatinedemocrats.org\n", + "9thdistgagop.org\n", + "rcdnet.org\n", + "azgop.org\n", + "maricopagop.org\n", + "kansas.reformparty.org\n", + "newjersey.reformparty.org\n", + "california.reformparty.org\n", + "timeline.reformparty.org\n", + "algop.org\n", + "pelicanpolitics.com\n", + "espanol.voter.com\n", + "gorelieberman.com\n", + "election.com\n", + "ceednet.org\n", + 
"followthemoney.org\n", + "debates.org\n", + "cagop.org\n", + "wsrp.org\n", + "indgop.org\n", + "members.freespeech.org\n", + "schoolelection.com\n", + "convention.texasgop.org\n", + "cal.votenader.org\n", + "candidate.grassroots.com\n", + "1-877-leadnow.com\n", + "madison.voter.com\n", + "sierraclub.org\n", + "mt.nacdnet.org\n", + "ma.lwv.org\n", + "irchelp.org\n", + "calvoter.org\n", + "njdems.org\n", + "sfvoter.com\n", + "vademocrats.org\n", + "reformparty.org\n", + "missouridems.org\n", + "pa.lwv.org\n", + "akdemocrats.org\n", + "njlp.org\n", + "hagelin.org\n", + "keyes2000.org\n", + "tray.com\n", + "nrsc.org\n", + "deldems.org\n", + "nrcc.org\n", + "ksdp.org\n", + "kansasyoungdemocrats.org\n", + "washington.reformparty.org\n", + "dems2000.com\n", + "arkgop.com\n", + "scdp.org\n", + "plp.org\n", + "votenader.org\n", + "votenader.com\n", + "northcarolina.reformparty.org\n", + "ca.lwv.org\n", + "ks.nacdnet.org\n", + "txdemocrats.org\n", + "politics1.com\n", + "gagop.org\n", + "slp.org\n", + "gwbush.com\n", + "akrepublicans.org\n", + "wi.nacdnet.org\n", + "green.votenader.org\n", + "rpv.org\n", + "fec.gov\n", + "nytimes.com\n", + "naacp.org\n", + "hawaiidemocrats.org\n", + "nygop.org\n", + "gopatgo2000.org\n", + "democratsabroad.org\n", + "pub.whitehouse.gov\n", + "archive.lp.org\n", + "gop-mn.org\n", + "migop.org\n", + "ca.lp.org\n", + "monmouthlp.org\n", + "ncdp.org\n", + "cologop.org\n", + "mi.lp.org\n", + "cobbdemocrats.org\n", + "tx.lp.org\n", + "campaignoffice.com\n", + "freetrial.campaignoffice.com\n", + "calendar.rnc.org\n", + "rireformparty.org\n", + "ehdemocrats.org\n", + "poll1.debates.org\n", + "nevadagreenparty.org\n", + "newvoter.com\n", + "mi.lwv.org\n", + "georgia.reformparty.org\n", + "delaware.reformparty.org\n", + "stonewalldfl.org\n", + "santacruzlp.org\n", + "forums.hagelin.org\n", + "forum.hagelin.org\n", + "iowagop.org\n", + "ohiogop.org\n", + "sddemocrats.org\n", + "skdemocrats.org\n", + "wisdems.org\n", + "sfgreenparty.org\n", + "il.lp.org\n", + "rtumble.com\n", + "ctdems.org\n", + "alaskarepublicans.com\n", + "detroitnaacp.org\n", + "greenparty.org\n", + "ndgop.com\n", + "nh-democrats.org\n", + "rosecity.net\n", + "sandiegovoter.com\n", + "montanagop.org\n", + "dc.reformparty.org\n", + "greenparties.org\n", + "mainegop.com\n", + "stmarysdemocrats.org\n", + "comalcountydemocrats.org\n", + "masonforrnc.org\n", + "sblp.org\n", + "chesapeakedemocrats.org\n", + "tejanodemocrats.org\n", + "connecticut.georgewbush.com\n", + "students.georgewbush.com\n", + "youngprofessionals.georgewbush.com\n", + "maine.georgewbush.com\n", + "latinos.georgewbush.com\n", + "veterans.georgewbush.com\n", + "africanamericans.georgewbush.com\n", + "missouri.georgewbush.com\n", + "agriculture.georgewbush.com\n", + "mississippi.georgewbush.com\n", + "minnesota.georgewbush.com\n", + "arizona.georgewbush.com\n", + "northcarolina.georgewbush.com\n", + "virginia.georgewbush.com\n", + "kentucky.georgewbush.com\n", + "texas.georgewbush.com\n", + "lvvlwv.org\n", + "kansassenatedemocrats.org\n", + "nhgop.org\n", + "nebraskademocrats.org\n", + "southcarolina.reformparty.org\n", + "tndemocrats.org\n", + "fcncgop.org\n", + "padems.com\n", + "gore-2000.com\n", + "union.arkdems.org\n", + "illinois.reformparty.org\n", + "nevadagop.org\n", + "rhodeisland.reformparty.org\n", + "massdems.org\n", + "allencountydemocrats.org\n", + "mogop.org\n", + "oklahoma.reformparty.org\n", + "oklp.org\n", + "speakout.com\n", + "windemocrats.org\n", + "washingtoncountydemocrats.org\n", + "salinecodemocrats.org\n", + 
"njgop.org\n", + "sddp.org\n", + "pennsylvania.reformparty.org\n", + "lademo.org\n", + "allgore.com\n", + "web.democrats.org\n", + "pagop.org\n", + "library.whitehouse.gov\n", + "docs.whitehouse.gov\n", + "idaho.reformparty.org\n", + "alaska.net\n", + "georgybush.com\n", + "rpof.org\n", + "publishing1.speakout.com\n", + "de.lp.org\n", + "mainedems.org\n", + "clarkgop.com\n", + "kansashousedemocrats.org\n", + "georgiaparty.com\n", + "la.lp.org\n", + "ny.lp.org\n", + "nebraska.reformparty.org\n", + "maine.reformparty.org\n", + "indiana.reformparty.org\n", + "myweb.clark.net\n", + "clark.net\n", + "ga.lwv.org\n", + "traviscountydemocrats.org\n", + "cheshiredemocrats.org\n", + "exchange.nrcc.org\n", + "growthelp.org\n", + "sbdemocrats.org\n", + "montana.reformparty.org\n", + "politicalshop.com\n", + "massgop.com\n", + "ohio.reformparty.org\n", + "scgop.com\n", + "wvgop.org\n", + "c-span.org\n", + "westvirginia.reformparty.org\n", + "wwwalgore2000.com\n", + "texas.reformparty.org\n", + "florida-democrats.org\n", + "delawaregop.com\n", + "publicrelations.reformparty.org\n", + "nj.nacdnet.org\n", + "ohionlp.org\n", + "communications.reformparty.org\n", + "newhampshire.reformparty.org\n", + "aladems.org\n", + "arkansas.reformparty.org\n", + "avlp.org\n", + "vtdemocrats.org\n", + "jackgreenlp.org\n", + "waynegop.org\n", + "mi-democrats.com\n", + "13thdistrictdems.org\n", + "rules.reformparty.org\n", + "negop.org\n", + "dscc.org\n", + "mccain2000.com\n", + "oclp.org\n", + "ilgop.org\n", + "hawaii.reformparty.org\n", + "arch-cgi.lp.org\n", + "crnc.org\n", + "sc.ca.lp.org\n", + "8thcd.vademocrats.org\n", + "foreignpolicy2000.org\n", + "bradely.campaignoffice.com\n", + "wwwsanderson.campaignoffice.com\n", + "florida.reformparty.org\n", + "al.lp.org\n", + "dpo.org\n", + "oahudemocrats.org\n", + "columbia.arkdems.org\n", + "kentucky.reformparty.org\n", + "phoenixnewtimes.com\n", + "purepolitics.com\n", + "concernedvoter.com\n", + "iowa.reformparty.org\n", + "wyoming.reformparty.org\n", + "harriscountygreenparty.org\n", + "american-politics.com\n", + "issues.reformparty.org\n", + "nysrtlp.org\n", + "stpaul.mn.lwv.org\n", + "arlingtondemocrats.org\n", + "okgop.com\n", + "utahgop.org\n", + "utdemocrats.org\n", + "mississippi.reformparty.org\n", + "plymouth.ma.nacdnet.org\n", + "tennessee.reformparty.org\n", + "minnesota.reformparty.org\n", + "dpnm.org\n", + "georgebush2000.com\n", + "vayoungdemocrats.org\n", + "northdakota.reformparty.org\n", + "stonewalldemocrats.org\n", + "virginia.reformparty.org\n", + "fastlane.net\n", + "youngdemocrats.org\n", + "msgop.org\n", + "calgop.org\n", + "votegrassroots.com\n", + "wvdemocrats.com\n", + "housedems2000.com\n", + "lubbockdemocrats.org\n", + "ildems.org\n", + "okdemocrats.org\n", + "lccdnet.org\n", + "fecweb1.fec.gov\n", + "trinity.ca.lp.org\n", + "ventura.ca.lp.org\n", + "3rdcd.vademocrats.org\n", + "de.lwv.org\n", + "mdgop.org\n", + "flgopsenate.campaignoffice.com\n", + "bradley.campaignoffice.com\n", + "kydems.campaignoffice.com\n", + "tx.nacdnet.org\n", + "mo.nacdnet.org\n", + "texasgop.org\n", + "in.rcdnet.org\n", + "life.ca.lp.org\n", + "victory.texasgop.org\n", + "charlestondemocrats.org\n", + "wyomingdemocrats.com\n", + "nd.nacdnet.org\n", + "college.reformparty.org\n", + "al.nacdnet.org\n", + "nddemnpl.campaignoffice.com\n", + "kulick-jackson.campaignoffice.com\n", + "wasiluk.campaignoffice.com\n", + "hilstrom.campaignoffice.com\n", + "schumacher.campaignoffice.com\n", + "dfl.org\n", + "slawik.campaignoffice.com\n", + "markthompson.campaignoffice.com\n", 
+ "rest.campaignoffice.com\n", + "vigil.campaignoffice.com\n", + "graves.campaignoffice.com\n", + "mcinnis.campaignoffice.com\n", + "hoosier.campaignoffice.com\n", + "connor.campaignoffice.com\n", + "bernardy.campaignoffice.com\n", + "housedflcaucus.campaignoffice.com\n", + "goodwin.campaignoffice.com\n", + "peaden.campaignoffice.com\n", + "kurita.campaignoffice.com\n", + "hi.lp.org\n", + "mtdemocrats.org\n", + "or.nacdnet.org\n", + "kcswcd.mo.nacdnet.org\n", + "id.nacdnet.org\n", + "sd.nacdnet.org\n", + "ny.nacdnet.org\n", + "yvote2000.com\n", + "oh.nacdnet.org\n", + "va.nacdnet.org\n", + "tn.nacdnet.org\n", + "fl.nacdnet.org\n", + "ca.nacdnet.org\n", + "co.nacdnet.org\n", + "ky.lp.org\n", + "georgewbush.com\n", + "massachusetts.reformparty.org\n", + "arizona.reformparty.org\n", + "louisiana.reformparty.org\n", + "nm.nacdnet.org\n", + "tazewell.va.nacdnet.org\n", + "aflcio.org\n", + "azdem.org\n", + "columbiana.oh.nacdnet.org\n", + "lacledeswcd.mo.nacdnet.org\n", + "reclaimdemocracy.org\n", + "ctgop.org\n", + "nevada.reformparty.org\n", + "in.nacdnet.org\n", + "michigan.reformparty.org\n", + "newyork.reformparty.org\n", + "nc.nacdnet.org\n", + "wa.nacdnet.org\n", + "ak.nacdnet.org\n", + "pa.nacdnet.org\n", + "billbradley.com\n", + "macdnet.org\n", + "lmcd.mt.nacdnet.org\n", + "socialdemocrats.org\n", + "bexardemocrats.org\n", + "alabama.reformparty.org\n", + "globalelection.com\n", + "wisconsin.reformparty.org\n", + "geocities.com\n", + "coloradodems.org\n" + ] + } + ], + "source": [ + "import re # For using regular expressions to remove parts of URLs\n", + "from urllib.parse import urlparse # For locating the base domain in URLs\n", + "\n", + "def get_domains(urls):\n", + " if urls is None:\n", + " return []\n", + " if type(urls) == str:\n", + " urls = [urls]\n", + " domains = set()\n", + " for url in urls:\n", + " parsed_url = urlparse(url)\n", + " domain = parsed_url.netloc\n", + " if type(domain) == bytes:\n", + " domain = None\n", + "\n", + " # Remove \"www.\" and ports if they exist\n", + " if domain is None or domain == '':\n", + " continue\n", + " else:\n", + " # Remove www., www1., etc.\n", + " domain = re.sub(r\"www\\d?\\.(.*)\", r\"\\1\", domain)\n", + " # Remove ports, as in some-website.com:80\n", + " domain = domain.split(':')[0]\n", + " domains.add(domain)\n", + " return list(domains)\n", + "\n", + "el00_df['domains'] = el00_df['original'].apply(get_domains).str[0]\n", + "for cdx_domain in el00_df['domains'].unique():\n", + " print(cdx_domain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see from the list above, the 2000 CDX files include content from a wide range of domains, not limited to political candidate campaign websites. \n", + "\n", + "In the early years of the [United States Elections Web Archive](https://www.loc.gov/collections/united-states-elections-web-archive/), the scope of the collection included websites of political parties, government, advocacy groups, bloggers, and other individuals and groups expressing relevant views. These sites have generally been moved into the [Public Policy Topics Web Archive](https://www.loc.gov/collections/public-policy-topics-web-archive/) or into the general web archives. However, the CDX files index the content as it was originally captured. The CDX files may also index content from non-candidate resources if candidate sites linked to those resources or embedded that content. Occassionally, other out of scope content may also appear in CDX files otherwise dedicated to U.S. elections. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's grab only those lines from the CDX files that match domains from the candidate websites in our `metadata.csv` file. We'll include the campaign candidates' websites themselves, as well as any domains that appear in the `scope` column. Domains that appear in the `scope` column are additional URLs that the web archiving crawler was instructed to collect in addition to the campaign website, if the campaign website linked to those URLs. For a more refined description, see this data package's `README`." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Domains from the 2000 US Elections collection:\n" + ] + }, + { + "data": { + "text/plain": [ + "['algore2000.com',\n", + " 'harrybrowne2000.org',\n", + " 'gopatgo2000.org',\n", + " 'algore.com',\n", + " 'keyes2000.org',\n", + " 'hagelin.org',\n", + " 'forum.hagelin.org']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "year = 2000\n", + "\n", + "def get_domains_by_year(year):\n", + " year_metadata = metadata_df[metadata_df['collection'].str.contains(year)].copy()\n", + " if len(year_metadata) > 0:\n", + " year_metadata['seeds_domains'] = year_metadata['website_url'].apply(get_domains)\n", + " year_metadata['scope_domains'] = year_metadata['website_scopes'].apply(get_domains)\n", + " year_metadata['all_domains'] = year_metadata['seeds_domains'] + year_metadata['scope_domains']\n", + " all_domains = [item for sublist in year_metadata['all_domains'].dropna() for item in sublist]\n", + " return list(set(all_domains))\n", + " else:\n", + " print(f'Sorry, there were no rows in metadata.csv for content from {year}')\n", + "\n", + "metadata_domains = get_domains_by_year(str(year))\n", + "print(f'Domains from the {str(year)} US Elections collection:')\n", + "metadata_domains\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we're ready to filter. Let's filter down our sample CDX lines to just those lines that point to the candidate website domains from metadata.csv, listed above. This means we'll only include CDX rows for domains like `algore2000.org`, `gopatgo2000.org`, but not sites like `vote.com` or `whitehouse.gov`. " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urlkeytimestamporiginalmimetypestatuscodedigestredirectmetatagsfile_sizeoffsetwarc_filenamedomains
25175com,algore2000,search)/search20001030063531http://search.algore2000.com:80/search/--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--977471624unique.20010415093936.arc.gzsearch.algore2000.com
26166com,algore2000,search)/search20001030053022http://search.algore2000.com:80/search/--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--977587973unique.20010415093936.arc.gzsearch.algore2000.com
49892com,algore2000,search)/search20001029053020http://search.algore2000.com:80/search/--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--9710612154unique.20010415093936.arc.gzsearch.algore2000.com
73526com,algore2000,search)/search20001028053001http://search.algore2000.com:80/search/--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--9913619683unique.20010415093936.arc.gzsearch.algore2000.com
97191com,algore2000,search)/search20001027053201http://search.algore2000.com:80/search/--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--9816632272unique.20010415093936.arc.gzsearch.algore2000.com
.......................................
336264org,keyes2000)/images/newsimage.jpg20001003073434http://keyes2000.org:80/images/newsimage.jpgimage/jpeg200LWERVVNORJQ6IBZCJ4SBNH26JU6NH3MV--1352776178594unique.20010415101811.arc.gzkeyes2000.org
336611com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html20001004075816http://www.algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.html--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--13076906140unique.20010415101811.arc.gzalgore2000.com
336612com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html20001004073516http://algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.html--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--12776906270unique.20010415101811.arc.gzalgore2000.com
336613com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html20001003075840http://www.algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.html--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--13076906397unique.20010415101811.arc.gzalgore2000.com
336614com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html20001003073434http://algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.htmltext/html2006Y6BX6SUDNF5CASBJH2LASINQ46ASMQF--960676906527unique.20010415101811.arc.gzalgore2000.com
\n", + "

6448 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " urlkey \\\n", + "25175 com,algore2000,search)/search \n", + "26166 com,algore2000,search)/search \n", + "49892 com,algore2000,search)/search \n", + "73526 com,algore2000,search)/search \n", + "97191 com,algore2000,search)/search \n", + "... ... \n", + "336264 org,keyes2000)/images/newsimage.jpg \n", + "336611 com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html \n", + "336612 com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html \n", + "336613 com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html \n", + "336614 com,algore2000)/briefingroom/releases/pr_091300_gore_wins_4.html \n", + "\n", + " timestamp \\\n", + "25175 20001030063531 \n", + "26166 20001030053022 \n", + "49892 20001029053020 \n", + "73526 20001028053001 \n", + "97191 20001027053201 \n", + "... ... \n", + "336264 20001003073434 \n", + "336611 20001004075816 \n", + "336612 20001004073516 \n", + "336613 20001003075840 \n", + "336614 20001003073434 \n", + "\n", + " original \\\n", + "25175 http://search.algore2000.com:80/search/ \n", + "26166 http://search.algore2000.com:80/search/ \n", + "49892 http://search.algore2000.com:80/search/ \n", + "73526 http://search.algore2000.com:80/search/ \n", + "97191 http://search.algore2000.com:80/search/ \n", + "... ... \n", + "336264 http://keyes2000.org:80/images/newsimage.jpg \n", + "336611 http://www.algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.html \n", + "336612 http://algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.html \n", + "336613 http://www.algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.html \n", + "336614 http://algore2000.com:80/briefingroom/releases/pr_091300_Gore_Wins_4.html \n", + "\n", + " mimetype statuscode digest redirect \\\n", + "25175 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "26166 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "49892 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "73526 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "97191 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "... ... ... ... ... \n", + "336264 image/jpeg 200 LWERVVNORJQ6IBZCJ4SBNH26JU6NH3MV - \n", + "336611 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "336612 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "336613 - - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - \n", + "336614 text/html 200 6Y6BX6SUDNF5CASBJH2LASINQ46ASMQF - \n", + "\n", + " metatags file_size offset warc_filename \\\n", + "25175 - 97 7471624 unique.20010415093936.arc.gz \n", + "26166 - 97 7587973 unique.20010415093936.arc.gz \n", + "49892 - 97 10612154 unique.20010415093936.arc.gz \n", + "73526 - 99 13619683 unique.20010415093936.arc.gz \n", + "97191 - 98 16632272 unique.20010415093936.arc.gz \n", + "... ... ... ... ... \n", + "336264 - 13527 76178594 unique.20010415101811.arc.gz \n", + "336611 - 130 76906140 unique.20010415101811.arc.gz \n", + "336612 - 127 76906270 unique.20010415101811.arc.gz \n", + "336613 - 130 76906397 unique.20010415101811.arc.gz \n", + "336614 - 9606 76906527 unique.20010415101811.arc.gz \n", + "\n", + " domains \n", + "25175 search.algore2000.com \n", + "26166 search.algore2000.com \n", + "49892 search.algore2000.com \n", + "73526 search.algore2000.com \n", + "97191 search.algore2000.com \n", + "... ... 
\n", + "336264 keyes2000.org \n", + "336611 algore2000.com \n", + "336612 algore2000.com \n", + "336613 algore2000.com \n", + "336614 algore2000.com \n", + "\n", + "[6448 rows x 12 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cdx_candidate_domains_el00 = el00_df[\n", + " el00_df['original'].apply(\n", + " lambda url: \n", + " any(domain in url for domain in metadata_domains) if url \n", + " else False\n", + " )\n", + "]\n", + "cdx_candidate_domains_el00" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fetching the Text\n", + "\n", + "Now that we know the majority of the remaining resources in this dataset have a text-based mimetype, we can gather all the text and do some basic analysis. First, we'll fetch all the text from just 50 rows. This will take a few minutes. " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "text_df = fetch_all_text(cdx_candidate_domains_el00.tail(50))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Top 25 Words\n", + "\n", + "Now that the text has been fetched, we'll do a simple summation and sorting, displaying the top 25 words from the first 50 rows of the 2000 Election dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tax 34\n", + "bush 34\n", + "cut 18\n", + "plan 16\n", + "republicans 11\n", + "new 11\n", + "gop 9\n", + "republican 7\n", + "budget 7\n", + "vote 7\n", + "00 7\n", + "gore 7\n", + "senate 6\n", + "committee 6\n", + "year 6\n", + "george 6\n", + "news 5\n", + "americans 5\n", + "2000 5\n", + "congressional 5\n", + "york 5\n", + "dakota 4\n", + "carolina 4\n", + "virginia 4\n", + "debt 4\n", + "dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_df.sum(axis=0).sort_values(ascending=False).head(25)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}