Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example #12 is done and ready for merge #13

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 254 additions & 0 deletions finding_stopped_datasets.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Finding datasets that have stopped being updated\n",
"This notebook shows how to find datasets that have stopped being updated.\n",
"It looks across all datasets and finds the datasets that have been stopped and then \n",
"does analysis with the table types to show which have the highest percentage of being stopped.\n",
"This can be extended to find for departments that have stopped releasing certain types of data, what other types are they still releasing."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openpolicedata as opd\n",
"from openpolicedata.exceptions import OPD_DataUnavailableError\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get all datasets but exclude the openpolicing.stanford.edu to prevent duplicates\n",
"all_datasets = opd.datasets.query()\n",
"all_datasets['source_url'] = all_datasets['source_url'].astype(str)\n",
"stanford_mask = all_datasets['source_url'].str.contains(\"openpolicing.stanford.edu\")\n",
"all_datasets = all_datasets.loc[~stanford_mask]\n",
"\n",
"# Show the distinct (source, state) pairs that remain after filtering\n",
"unique_source_state_combinations = all_datasets[['SourceName', 'State']].drop_duplicates()\n",
"print(unique_source_state_combinations) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Limit the dataframe columns for analysis\n",
"selected_columns = ['State', 'SourceName', 'TableType', 'coverage_start', 'coverage_end']\n",
"filtered_df = pd.DataFrame(all_datasets, columns=selected_columns)\n",
"\n",
"# Rows missing either coverage date cannot produce a year range\n",
"filtered_df = filtered_df.dropna(subset=['coverage_start', 'coverage_end'])\n",
"\n",
"# Create entries for each year instead of a range of years. This will create more rows, but will make it easier to filter the data\n",
"filtered_df['ListOfYears'] = [\n",
"    list(range(int(start.year), int(end.year) + 1))\n",
"    for start, end in zip(filtered_df['coverage_start'], filtered_df['coverage_end'])\n",
"]\n",
"filtered_df = filtered_df.drop(columns=['coverage_start', 'coverage_end'])\n",
"#filtered_df = filtered_df.explode('ListOfYears')\n",
"\n",
"print(filtered_df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Load data again from the file\n",
"df = filtered_df\n",
"# modify the TableType column values to remove the \" - \" and following text if it exists otherwise keep the original value\n",
"df['TableType'] = df['TableType'].str.split(' - ').str[0]\n",
"# remove all rows where grouped is None\n",
"df = df.dropna(subset=['ListOfYears'])\n",
"\n",
"# Calculate the minimum and maximum years from the combined lists\n",
"grouped=df\n",
"grouped['MinYear'] = grouped['ListOfYears'].apply(min)\n",
"grouped['MaxYear'] = grouped['ListOfYears'].apply(max)\n",
"\n",
"# Create a label for plotting\n",
"grouped['Label'] = grouped['SourceName'] + ', ' + grouped['State'] + ', ' + grouped['TableType']\n",
"\n",
"# create a horizontal bar chart where each element of the vertical axis is the unique combination of the 'SourceName', 'State', 'TableType' columns\n",
"# the horizontal axis should be the years of data available for each unique combination of 'SourceName', 'State', 'TableType'\n",
"plt.figure(figsize=(7, 60))\n",
"plt.barh(grouped['Label'], grouped['MaxYear'] - grouped['MinYear'], left=grouped['MinYear'], color='blue', height=0.8)\n",
"plt.xlabel('Year')\n",
"plt.ylabel('Source, State, Table Type')\n",
"plt.title('Years of Data Available by Source, State, and Table Type')\n",
"\n",
"for index, row in grouped.iterrows():\n",
" plt.text(row['MinYear'], index, f\"{row['MinYear']} - {row['MaxYear']}\", va='center', color='white')\n",
"\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Find the datasets where the data is most likely to be stopped within the year\n",
"current_year = 2024\n",
"minimum_tabletype_counts = 10\n",
"\n",
"grouped['MaxYear'] = grouped['MaxYear'].fillna(0).astype(int)\n",
"tabletype_counts = grouped['TableType'].value_counts()\n",
"\n",
"tabletype_counts = tabletype_counts[tabletype_counts >= minimum_tabletype_counts]\n",
"grouped = grouped[grouped['TableType'].isin(tabletype_counts.index)]\n",
"\n",
"stopped_datasets = grouped[grouped['MaxYear'] < (current_year-1)]\n",
"stopped_tabletype_counts = stopped_datasets['TableType'].value_counts()\n",
"\n",
"\n",
"# compute a bar graph histogram of the number of datasets that are stopped by TableType\n",
"fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))\n",
"\n",
"tabletype_counts.plot(kind='bar', ax=axes[0])\n",
"axes[0].set_xlabel('Table Type')\n",
"axes[0].set_ylabel('Number of Datasets')\n",
"axes[0].set_title('Number of All Datasets by Table Type')\n",
"\n",
"stopped_tabletype_counts.plot(kind='bar', ax=axes[1])\n",
"axes[1].set_xlabel('Table Type')\n",
"axes[1].set_ylabel('Number of Datasets')\n",
"axes[1].set_title('Number of Stopped Datasets by Table Type')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Find which type has the highest ratio of stopped datasets\n",
"ratio = stopped_tabletype_counts / tabletype_counts\n",
"ratio = ratio.fillna(0)\n",
"\n",
"# Create a bar plot of the ratio and sort the values from high to low\n",
"ratio = ratio.sort_values(ascending=False)\n",
"plt.figure(figsize=(10, 6))\n",
"ratio.plot(kind='bar')\n",
"plt.xlabel('Table Type')\n",
"plt.ylabel('Ratio of Stopped Datasets to All Datasets')\n",
"plt.title('Ratio of Stopped Datasets to All Datasets by Table Type')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For each unique combination of SourceName, State, and AgencyFull find the number of new datasets that have been released \n",
"# in the current year for all TableTypes\n",
"\n",
"# A dataset is \"current\" if its coverage ends in this year or the previous one\n",
"recent_mask = all_datasets['coverage_end'].dt.year.isin([current_year, current_year - 1])\n",
"current_year_datasets = all_datasets[recent_mask]\n",
"current_year_datasets_sum = (\n",
"    current_year_datasets\n",
"    .groupby(['SourceName', 'State', 'AgencyFull'])['TableType']\n",
"    .nunique()\n",
"    .sort_values(ascending=False)\n",
")\n",
"\n",
"print(current_year_datasets_sum)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For each unique combination of SourceName, State, and AgencyFull find the number of stopped datasets which is \n",
"# defined as not being released this year or the previous year.\n",
"# BUG FIX: the original filter used (year != current_year) | (year != current_year - 1),\n",
"# which is true for every row (no year can equal both values), so nothing was\n",
"# excluded and every dataset was counted as stopped. A stopped dataset must end\n",
"# before BOTH the current and the previous year.\n",
"# NOTE(review): rows with a missing coverage_end also pass this filter and are\n",
"# counted as stopped -- confirm that is the intended treatment.\n",
"stopped_current_year_datasets = all_datasets[~all_datasets['coverage_end'].dt.year.isin([current_year, current_year - 1])]\n",
"\n",
"# Group the filtered datasets by SourceName and State, and count the number of unique TableTypes\n",
"stopped_current_year_datasets_sum = stopped_current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull' ])['TableType'].nunique()\n",
"\n",
"# sort stopped_current_year_datasets_sum in descending order\n",
"stopped_current_year_datasets_sum = stopped_current_year_datasets_sum.sort_values(ascending=False)\n",
"\n",
"# Print the number of stopped datasets for each unique combination of SourceName, State, and AgencyFull\n",
"print(stopped_current_year_datasets_sum)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot the current_year_datasets_sum and stopped_current_year_datasets_sum on a scatter chart where the x-axis is the number of new datasets\n",
"# first match only the SourceName, State, and AgencyFull that are in both current_year_datasets_sum and stopped_current_year_datasets_sum\n",
"# and the y-axis is the number of stopped datasets.\n",
"\n",
"# Inner-join the two grouped counts; both Series are named 'TableType', so the\n",
"# suffixes distinguish the current vs. stopped columns in the merged frame\n",
"merged_df = pd.merge(current_year_datasets_sum, stopped_current_year_datasets_sum,\n",
"                     on=['SourceName', 'State', 'AgencyFull'],\n",
"                     suffixes=('_current', '_stopped'))\n",
"\n",
"markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']\n",
"colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'lime', 'navy', 'teal']\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"labels = []\n",
"handles = []\n",
"\n",
"# Draw each point with a cycling marker/color pair and build proxy handles for the legend\n",
"pairs = zip(merged_df['TableType_current'], merged_df['TableType_stopped'])\n",
"for row_num, (n_current, n_stopped) in enumerate(pairs):\n",
"    legend_text = ', '.join(map(str, merged_df.index[row_num]))\n",
"    shortened_label = ''  # on-plot annotation is intentionally blank; the legend carries the names\n",
"    marker_style = markers[row_num % len(markers)]\n",
"    point_color = colors[row_num % len(colors)]\n",
"    plt.scatter(n_current, n_stopped, color=point_color, s=100, marker=marker_style)\n",
"    plt.annotate(shortened_label, (n_current, n_stopped), textcoords=\"offset points\", xytext=(0,10), ha='center')\n",
"    labels.append(legend_text)\n",
"    handles.append(plt.Line2D([0], [0], marker=marker_style, color='w', markerfacecolor=point_color, markersize=10, label=legend_text))\n",
"\n",
"plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)\n",
"plt.xlabel('Number of New Datasets')\n",
"plt.ylabel('Number of Stopped Datasets')\n",
"plt.title('New vs. Stopped Datasets by Source, State, and AgencyFull')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('opd')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "a73158d29711b2da05ac73de25b71e5d8cae591f14917bba77a9573b5c85a0ce"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}