-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
example #12 is done and ready for merge
- Loading branch information
Showing
1 changed file
with
254 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,254 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Finding datasets that have stopped being updated\n", | ||
"This notebook shows how to find datasets that have stopped being updated.\n", | ||
"It looks across all datasets, identifies those that have stopped being updated, and then \n", | ||
"analyzes the table types to show which have the highest percentage of stopped datasets.\n", | ||
"This can be extended to determine, for departments that have stopped releasing certain types of data, which other types they are still releasing." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import openpolicedata as opd\n", | ||
"from openpolicedata.exceptions import OPD_DataUnavailableError\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"import matplotlib.pyplot as plt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Fetch the full OPD dataset catalog, excluding the openpolicing.stanford.edu\n", | ||
"# mirrors so the same dataset is not counted twice.\n", | ||
"all_datasets = opd.datasets.query()\n", | ||
"all_datasets['source_url'] = all_datasets['source_url'].astype(str)\n", | ||
"is_stanford_mirror = all_datasets['source_url'].str.contains(\"openpolicing.stanford.edu\")\n", | ||
"all_datasets = all_datasets.loc[~is_stanford_mirror]\n", | ||
"\n", | ||
"# Show every distinct (source, state) pair remaining in the catalog\n", | ||
"unique_source_state_combinations = all_datasets[['SourceName', 'State']].drop_duplicates()\n", | ||
"print(unique_source_state_combinations)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Limit the dataframe to the columns needed for the coverage analysis.\n", | ||
"selected_columns = ['State', 'SourceName', 'TableType', 'coverage_start', 'coverage_end']\n", | ||
"# .copy() gives an independent frame so the column assignments below cannot\n", | ||
"# mutate all_datasets or raise SettingWithCopyWarning.\n", | ||
"filtered_df = all_datasets[selected_columns].copy()\n", | ||
"\n", | ||
"# Both coverage dates are needed to compute the span of years; drop rows missing either.\n", | ||
"filtered_df = filtered_df.dropna(subset=['coverage_start', 'coverage_end'])\n", | ||
"\n", | ||
"# Expand each coverage range into an explicit, inclusive list of years.\n", | ||
"# This creates more data per row but makes later per-year filtering easier.\n", | ||
"filtered_df['ListOfYears'] = filtered_df.apply(\n", | ||
"    lambda row: list(range(int(row['coverage_start'].year), int(row['coverage_end'].year) + 1)), axis=1)\n", | ||
"filtered_df = filtered_df.drop(columns=['coverage_start', 'coverage_end'])\n", | ||
"\n", | ||
"print(filtered_df.head())\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"# Work on a copy so the cleanup below does not silently mutate filtered_df;\n", | ||
"# re-running this cell then always starts from the same input.\n", | ||
"df = filtered_df.copy()\n", | ||
"# Keep only the base table type: strip any \" - \" qualifier and the text after it.\n", | ||
"df['TableType'] = df['TableType'].str.split(' - ').str[0]\n", | ||
"# Remove rows with no year-coverage information\n", | ||
"df = df.dropna(subset=['ListOfYears'])\n", | ||
"\n", | ||
"# First and last year of coverage for each dataset\n", | ||
"grouped = df\n", | ||
"grouped['MinYear'] = grouped['ListOfYears'].apply(min)\n", | ||
"grouped['MaxYear'] = grouped['ListOfYears'].apply(max)\n", | ||
"\n", | ||
"# Human-readable label for each bar\n", | ||
"grouped['Label'] = grouped['SourceName'] + ', ' + grouped['State'] + ', ' + grouped['TableType']\n", | ||
"\n", | ||
"# Horizontal bar chart: one bar per (source, state, table type) combination,\n", | ||
"# spanning the years of available data on the horizontal axis.\n", | ||
"plt.figure(figsize=(7, 60))\n", | ||
"plt.barh(grouped['Label'], grouped['MaxYear'] - grouped['MinYear'], left=grouped['MinYear'], color='blue', height=0.8)\n", | ||
"plt.xlabel('Year')\n", | ||
"plt.ylabel('Source, State, Table Type')\n", | ||
"plt.title('Years of Data Available by Source, State, and Table Type')\n", | ||
"\n", | ||
"# Annotate each bar with its year range. BUG FIX: use the bar's 0-based plot\n", | ||
"# position instead of the DataFrame index -- after dropna the index has gaps\n", | ||
"# and no longer matches the y-coordinates matplotlib assigns to the bars.\n", | ||
"# (Assumes the labels are unique so bars occupy consecutive positions.)\n", | ||
"for pos, (_, row) in enumerate(grouped.iterrows()):\n", | ||
"    plt.text(row['MinYear'], pos, f\"{row['MinYear']} - {row['MaxYear']}\", va='center', color='white')\n", | ||
"\n", | ||
"plt.show()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Identify the table types whose datasets most often appear abandoned,\n", | ||
"# i.e. have published nothing since before last year.\n", | ||
"current_year = 2024\n", | ||
"minimum_tabletype_counts = 10\n", | ||
"\n", | ||
"grouped['MaxYear'] = grouped['MaxYear'].fillna(0).astype(int)\n", | ||
"tabletype_counts = grouped['TableType'].value_counts()\n", | ||
"\n", | ||
"# Ignore rare table types: too few datasets for a meaningful ratio\n", | ||
"tabletype_counts = tabletype_counts[tabletype_counts >= minimum_tabletype_counts]\n", | ||
"grouped = grouped[grouped['TableType'].isin(tabletype_counts.index)]\n", | ||
"\n", | ||
"# A dataset is \"stopped\" when its newest data predates last year\n", | ||
"stopped_datasets = grouped[grouped['MaxYear'] < (current_year-1)]\n", | ||
"stopped_tabletype_counts = stopped_datasets['TableType'].value_counts()\n", | ||
"\n", | ||
"# Side-by-side bar charts: all datasets vs. stopped datasets per table type\n", | ||
"fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))\n", | ||
"chart_specs = [\n", | ||
"    (tabletype_counts, 'Number of All Datasets by Table Type'),\n", | ||
"    (stopped_tabletype_counts, 'Number of Stopped Datasets by Table Type'),\n", | ||
"]\n", | ||
"for ax, (counts, title) in zip(axes, chart_specs):\n", | ||
"    counts.plot(kind='bar', ax=ax)\n", | ||
"    ax.set_xlabel('Table Type')\n", | ||
"    ax.set_ylabel('Number of Datasets')\n", | ||
"    ax.set_title(title)\n", | ||
"\n", | ||
"plt.tight_layout()\n", | ||
"plt.show()\n", | ||
"\n", | ||
"# Fraction of each table type's datasets that have stopped, highest first\n", | ||
"ratio = (stopped_tabletype_counts / tabletype_counts).fillna(0).sort_values(ascending=False)\n", | ||
"plt.figure(figsize=(10, 6))\n", | ||
"ratio.plot(kind='bar')\n", | ||
"plt.xlabel('Table Type')\n", | ||
"plt.ylabel('Ratio of Stopped Datasets to All Datasets')\n", | ||
"plt.title('Ratio of Stopped Datasets to All Datasets by Table Type')\n", | ||
"plt.show()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# For each (SourceName, State, AgencyFull) combination, count how many\n", | ||
"# distinct table types were released this year or last year (\"active\").\n", | ||
"end_year = all_datasets['coverage_end'].dt.year\n", | ||
"current_year_datasets = all_datasets[end_year.isin([current_year, current_year - 1])]\n", | ||
"current_year_datasets_sum = (\n", | ||
"    current_year_datasets\n", | ||
"    .groupby(['SourceName', 'State', 'AgencyFull'])['TableType']\n", | ||
"    .nunique()\n", | ||
"    .sort_values(ascending=False)\n", | ||
")\n", | ||
"\n", | ||
"print(current_year_datasets_sum)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# For each unique combination of SourceName, State, and AgencyFull find the number of stopped datasets,\n", | ||
"# defined as not being released this year or the previous year.\n", | ||
"#\n", | ||
"# BUG FIX: the original condition joined the two inequalities with \"|\", which is\n", | ||
"# always True (every year differs from at least one of two distinct values), so\n", | ||
"# nothing was ever filtered out. A dataset is stopped only when its coverage_end\n", | ||
"# year is neither the current year NOR the previous year, i.e. \"&\".\n", | ||
"end_year = all_datasets['coverage_end'].dt.year\n", | ||
"stopped_current_year_datasets = all_datasets[(end_year != current_year) & (end_year != current_year - 1)]\n", | ||
"\n", | ||
"# Group by source/state/agency and count the number of unique stopped TableTypes\n", | ||
"stopped_current_year_datasets_sum = stopped_current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull'])['TableType'].nunique()\n", | ||
"\n", | ||
"# Sort descending so agencies with the most stopped table types come first\n", | ||
"stopped_current_year_datasets_sum = stopped_current_year_datasets_sum.sort_values(ascending=False)\n", | ||
"\n", | ||
"print(stopped_current_year_datasets_sum)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Scatter chart comparing, for each (SourceName, State, AgencyFull) present in\n", | ||
"# BOTH summaries, the number of new datasets (x-axis) against the number of\n", | ||
"# stopped datasets (y-axis). The merge keeps only the shared key combinations.\n", | ||
"merged_df = pd.merge(current_year_datasets_sum, stopped_current_year_datasets_sum, \n", | ||
"                     on=['SourceName', 'State', 'AgencyFull'], \n", | ||
"                     suffixes=('_current', '_stopped'))\n", | ||
"\n", | ||
"# Cycle through marker/color combinations so nearby points stay distinguishable\n", | ||
"markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']\n", | ||
"colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'lime', 'navy', 'teal']\n", | ||
"\n", | ||
"plt.figure(figsize=(10, 6))\n", | ||
"labels = []\n", | ||
"handles = []\n", | ||
"\n", | ||
"# Draw each point and build a proxy legend handle carrying its full label\n", | ||
"point_pairs = zip(merged_df['TableType_current'], merged_df['TableType_stopped'])\n", | ||
"for idx, (n_current, n_stopped) in enumerate(point_pairs):\n", | ||
"    index_label = ', '.join(map(str, merged_df.index[idx]))\n", | ||
"    shortened_label = ''  # intentionally blank: the point labels live in the legend instead\n", | ||
"    marker = markers[idx % len(markers)]\n", | ||
"    color = colors[idx % len(colors)]\n", | ||
"    plt.scatter(n_current, n_stopped, color=color, s=100, marker=marker)\n", | ||
"    plt.annotate(shortened_label, (n_current, n_stopped), textcoords=\"offset points\", xytext=(0,10), ha='center')\n", | ||
"    labels.append(index_label)\n", | ||
"    handles.append(plt.Line2D([0], [0], marker=marker, color='w', markerfacecolor=color, markersize=10, label=index_label))\n", | ||
"\n", | ||
"# Place the legend outside the axes so it never covers data points\n", | ||
"plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)\n", | ||
"plt.xlabel('Number of New Datasets')\n", | ||
"plt.ylabel('Number of Stopped Datasets')\n", | ||
"plt.title('New vs. Stopped Datasets by Source, State, and AgencyFull')\n", | ||
"plt.show()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3.9.12 ('opd')", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.11" | ||
}, | ||
"orig_nbformat": 4, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "a73158d29711b2da05ac73de25b71e5d8cae591f14917bba77a9573b5c85a0ce" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |