Skip to content

Commit

Permalink
example #12 is done and ready for merge
Browse files Browse the repository at this point in the history
  • Loading branch information
potto216 committed Sep 30, 2024
1 parent 6992332 commit 435953c
Showing 1 changed file with 254 additions and 0 deletions.
254 changes: 254 additions & 0 deletions finding_stopped_datasets.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Finding datasets that have stopped being updated\n",
"This notebook shows how to find datasets that have stopped being updated.\n",
"It looks across all datasets and finds the datasets that have been stopped and then \n",
"does analysis with the table types to show which have the highest percentage of being stopped.\n",
"This can be extended to determine, for departments that have stopped releasing certain types of data, which other data types they are still releasing."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openpolicedata as opd\n",
"from openpolicedata.exceptions import OPD_DataUnavailableError\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the full dataset catalog, then drop the openpolicing.stanford.edu mirrors\n",
"# so the same underlying data is not counted twice in later aggregations.\n",
"all_datasets = opd.datasets.query()\n",
"all_datasets['source_url'] = all_datasets['source_url'].astype(str)\n",
"stanford_mask = all_datasets['source_url'].str.contains(\"openpolicing.stanford.edu\")\n",
"all_datasets = all_datasets[~stanford_mask]\n",
"\n",
"# One row per (source, state) pair, as a quick overview of what is covered\n",
"unique_source_state_combinations = all_datasets[['SourceName', 'State']].drop_duplicates()\n",
"print(unique_source_state_combinations) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Limit the dataframe columns for analysis.\n",
"selected_columns = ['State', 'SourceName', 'TableType', 'coverage_start', 'coverage_end']\n",
"# Select the columns explicitly and copy, so later in-place edits cannot\n",
"# write through to all_datasets (which later cells still rely on).\n",
"filtered_df = all_datasets[selected_columns].copy()\n",
"\n",
"# Rows with no known coverage window cannot be checked for staleness.\n",
"filtered_df = filtered_df.dropna(subset=['coverage_start', 'coverage_end'])\n",
"\n",
"# Expand each coverage range into an explicit list of covered years. This adds a\n",
"# column, but makes the later min/max-year filtering straightforward.\n",
"# (.year is already an int, so no extra int() cast is needed.)\n",
"filtered_df['ListOfYears'] = filtered_df.apply(\n",
"    lambda row: list(range(row['coverage_start'].year, row['coverage_end'].year + 1)), axis=1)\n",
"filtered_df = filtered_df.drop(columns=['coverage_start', 'coverage_end'])\n",
"\n",
"print(filtered_df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Work on a copy so the upstream filtered_df is left untouched by the\n",
"# in-place column edits below.\n",
"df = filtered_df.copy()\n",
"# Collapse table-type subtypes (anything after \" - \") down to the base type;\n",
"# values without a \" - \" separator are kept as-is.\n",
"df['TableType'] = df['TableType'].str.split(' - ').str[0]\n",
"# Remove rows with no year coverage information.\n",
"df = df.dropna(subset=['ListOfYears'])\n",
"\n",
"# First and last covered year for each dataset.\n",
"grouped = df\n",
"grouped['MinYear'] = grouped['ListOfYears'].apply(min)\n",
"grouped['MaxYear'] = grouped['ListOfYears'].apply(max)\n",
"\n",
"# Label each bar with its source/state/table-type combination.\n",
"grouped['Label'] = grouped['SourceName'] + ', ' + grouped['State'] + ', ' + grouped['TableType']\n",
"\n",
"# Horizontal Gantt-style chart: one bar per dataset, spanning its covered years.\n",
"plt.figure(figsize=(7, 60))\n",
"plt.barh(grouped['Label'], grouped['MaxYear'] - grouped['MinYear'], left=grouped['MinYear'], color='blue', height=0.8)\n",
"plt.xlabel('Year')\n",
"plt.ylabel('Source, State, Table Type')\n",
"plt.title('Years of Data Available by Source, State, and Table Type')\n",
"\n",
"# Use the positional row number for the y-coordinate, NOT the DataFrame index:\n",
"# after dropna the index is no longer contiguous, while barh places bars at\n",
"# positions 0..n-1, so using the label index misplaced the annotations.\n",
"# NOTE(review): this assumes Label values are unique; duplicate labels share a\n",
"# bar position — confirm against the data.\n",
"for pos, (_, row) in enumerate(grouped.iterrows()):\n",
"    plt.text(row['MinYear'], pos, f\"{row['MinYear']} - {row['MaxYear']}\", va='center', color='white')\n",
"\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Identify datasets that appear to have stopped updating: their coverage ends\n",
"# before last year. Table types with few datasets are excluded as noise.\n",
"current_year = 2024\n",
"minimum_tabletype_counts = 10\n",
"\n",
"grouped['MaxYear'] = grouped['MaxYear'].fillna(0).astype(int)\n",
"\n",
"# Keep only table types with a meaningful sample size.\n",
"tabletype_counts = grouped['TableType'].value_counts()\n",
"tabletype_counts = tabletype_counts[tabletype_counts >= minimum_tabletype_counts]\n",
"grouped = grouped[grouped['TableType'].isin(tabletype_counts.index)]\n",
"\n",
"# \"Stopped\" here means the last covered year is earlier than the previous year.\n",
"stopped_datasets = grouped[grouped['MaxYear'] < (current_year - 1)]\n",
"stopped_tabletype_counts = stopped_datasets['TableType'].value_counts()\n",
"\n",
"# Side-by-side bar charts: all datasets vs. stopped datasets, by table type.\n",
"fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))\n",
"panel_data = [\n",
"    (tabletype_counts, 'Number of All Datasets by Table Type'),\n",
"    (stopped_tabletype_counts, 'Number of Stopped Datasets by Table Type'),\n",
"]\n",
"for ax, (counts, title) in zip(axes, panel_data):\n",
"    counts.plot(kind='bar', ax=ax)\n",
"    ax.set_xlabel('Table Type')\n",
"    ax.set_ylabel('Number of Datasets')\n",
"    ax.set_title(title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Fraction of each table type that has stopped; table types with no stopped\n",
"# datasets are missing from the numerator and get a ratio of 0.\n",
"ratio = (stopped_tabletype_counts / tabletype_counts).fillna(0)\n",
"\n",
"# Plot the ratios sorted from most-stopped to least-stopped.\n",
"ratio = ratio.sort_values(ascending=False)\n",
"plt.figure(figsize=(10, 6))\n",
"ratio.plot(kind='bar')\n",
"plt.xlabel('Table Type')\n",
"plt.ylabel('Ratio of Stopped Datasets to All Datasets')\n",
"plt.title('Ratio of Stopped Datasets to All Datasets by Table Type')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For each unique (SourceName, State, AgencyFull) combination, count how many\n",
"# distinct table types had data released in the current or previous year.\n",
"recent_years = [current_year, current_year - 1]\n",
"current_year_datasets = all_datasets[all_datasets['coverage_end'].dt.year.isin(recent_years)]\n",
"current_year_datasets_sum = current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull'])['TableType'].nunique()\n",
"current_year_datasets_sum = current_year_datasets_sum.sort_values(ascending=False)\n",
"\n",
"print(current_year_datasets_sum)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For each unique combination of SourceName, State, and AgencyFull find the number\n",
"# of stopped datasets, defined as coverage not reaching this year or the previous year.\n",
"# BUG FIX: the original used `(year != current_year) | (year != current_year - 1)`,\n",
"# which is true for EVERY row (no single year equals both values), so every dataset\n",
"# was counted as stopped. The correct predicate is \"not in either recent year\".\n",
"stopped_current_year_datasets = all_datasets[~all_datasets['coverage_end'].dt.year.isin([current_year, current_year - 1])]\n",
"\n",
"# Group the stopped datasets by SourceName/State/AgencyFull and count the\n",
"# number of unique TableTypes each combination has stopped releasing.\n",
"stopped_current_year_datasets_sum = stopped_current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull'])['TableType'].nunique()\n",
"\n",
"# Sort so the agencies with the most stopped table types come first.\n",
"stopped_current_year_datasets_sum = stopped_current_year_datasets_sum.sort_values(ascending=False)\n",
"\n",
"# Show the stopped-table-type count per SourceName/State/AgencyFull combination.\n",
"print(stopped_current_year_datasets_sum)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot the current_year_datasets_sum and stopped_current_year_datasets_sum on a scatter chart where the x-axis is the number of new datasets\n",
"# first match only the SourceName, State, and AgencyFull that are in both current_year_datasets_sum and stopped_current_year_datasets_sum\n",
"# and the y-axis is the number of stopped datasets.\n",
"\n",
"\n",
"merged_df = pd.merge(current_year_datasets_sum, stopped_current_year_datasets_sum, \n",
" on=['SourceName', 'State', 'AgencyFull'], \n",
" suffixes=('_current', '_stopped'))\n",
"\n",
"markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']\n",
"colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'lime', 'navy', 'teal']\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"labels = []\n",
"handles = []\n",
"\n",
"# Annotate each point with the index names and collect labels for the legend\n",
"for i, (current, stopped) in enumerate(zip(merged_df['TableType_current'], merged_df['TableType_stopped'])):\n",
" index_label = ', '.join(map(str, merged_df.index[i]))\n",
" shortened_label = ''\n",
" marker = markers[i % len(markers)] \n",
" color = colors[i % len(colors)] \n",
" plt.scatter(current, stopped, color=color, s=100, marker=marker)\n",
" plt.annotate(shortened_label, (current, stopped), textcoords=\"offset points\", xytext=(0,10), ha='center')\n",
" labels.append(index_label)\n",
" handles.append(plt.Line2D([0], [0], marker=marker, color='w', markerfacecolor=color, markersize=10, label=index_label))\n",
"\n",
"\n",
"plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)\n",
"plt.xlabel('Number of New Datasets')\n",
"plt.ylabel('Number of Stopped Datasets')\n",
"plt.title('New vs. Stopped Datasets by Source, State, and AgencyFull')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('opd')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "a73158d29711b2da05ac73de25b71e5d8cae591f14917bba77a9573b5c85a0ce"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 435953c

Please sign in to comment.