From 435953c34d7dbf5e7cf0f79413b26f6894713b60 Mon Sep 17 00:00:00 2001
From: Paul Otto
Date: Mon, 30 Sep 2024 03:37:28 -0700
Subject: [PATCH] example #12 is done and ready for merge

---
 finding_stopped_datasets.ipynb | 254 +++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 finding_stopped_datasets.ipynb

diff --git a/finding_stopped_datasets.ipynb b/finding_stopped_datasets.ipynb
new file mode 100644
index 0000000..2bfe54d
--- /dev/null
+++ b/finding_stopped_datasets.ipynb
@@ -0,0 +1,254 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Finding datasets that have stopped being updated\n",
+    "This notebook shows how to find datasets that have stopped being updated.\n",
+    "It looks across all datasets, identifies the ones that appear to have stopped, and then\n",
+    "analyzes the table types to show which have the highest percentage of stopped datasets.\n",
+    "This can be extended to ask, for departments that have stopped releasing certain types of data, which other types they are still releasing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openpolicedata as opd\n",
+    "from openpolicedata.exceptions import OPD_DataUnavailableError\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get all datasets but exclude openpolicing.stanford.edu sources to prevent duplicates\n",
+    "all_datasets = opd.datasets.query()\n",
+    "all_datasets['source_url'] = all_datasets['source_url'].astype(str)\n",
+    "all_datasets = all_datasets[~all_datasets['source_url'].str.contains(\"openpolicing.stanford.edu\")]\n",
+    "\n",
+    "unique_source_state_combinations = all_datasets[['SourceName', 'State']].drop_duplicates()\n",
+    "print(unique_source_state_combinations)"
+   ]
+  },
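+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before filtering, it can help to see how many datasets even have coverage dates, since the next cell drops rows without them. This is a minimal sketch assuming only the `coverage_start` and `coverage_end` columns of `all_datasets` are needed for that check."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count how many datasets have coverage dates and how many are missing them\n",
+    "# (rows missing either date are dropped in the next cell)\n",
+    "missing_coverage = all_datasets[['coverage_start', 'coverage_end']].isna().any(axis=1)\n",
+    "print(f\"Datasets with coverage dates: {(~missing_coverage).sum()}\")\n",
+    "print(f\"Datasets missing coverage dates: {missing_coverage.sum()}\")"
+   ]
+  },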
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Limit the dataframe columns for analysis\n",
+    "selected_columns = ['State', 'SourceName', 'TableType', 'coverage_start', 'coverage_end']\n",
+    "filtered_df = pd.DataFrame(all_datasets, columns=selected_columns)\n",
+    "\n",
+    "filtered_df = filtered_df.dropna(subset=['coverage_start', 'coverage_end'])\n",
+    "\n",
+    "# Convert each coverage range into a list of covered years\n",
+    "# (exploding this list would create one row per year, which makes it easier to filter the data by year)\n",
+    "filtered_df['ListOfYears'] = filtered_df.apply(\n",
+    "    lambda row: list(range(int(row['coverage_start'].year), int(row['coverage_end'].year) + 1)), axis=1)\n",
+    "filtered_df = filtered_df.drop(columns=['coverage_start', 'coverage_end'])\n",
+    "#filtered_df = filtered_df.explode('ListOfYears')\n",
+    "\n",
+    "print(filtered_df.head())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Continue with the filtered dataframe from the previous cell\n",
+    "df = filtered_df\n",
+    "# Remove \" - \" and any following text from the TableType values if present; otherwise keep the original value\n",
+    "df['TableType'] = df['TableType'].str.split(' - ').str[0]\n",
+    "# Remove all rows where ListOfYears is missing\n",
+    "df = df.dropna(subset=['ListOfYears'])\n",
+    "\n",
+    "# Calculate the minimum and maximum years from the year lists\n",
+    "grouped = df\n",
+    "grouped['MinYear'] = grouped['ListOfYears'].apply(min)\n",
+    "grouped['MaxYear'] = grouped['ListOfYears'].apply(max)\n",
+    "\n",
+    "# Create a label for plotting\n",
+    "grouped['Label'] = grouped['SourceName'] + ', ' + grouped['State'] + ', ' + grouped['TableType']\n",
+    "\n",
+    "# Create a horizontal bar chart where each element of the vertical axis is a unique combination of the\n",
+    "# 'SourceName', 'State', and 'TableType' columns and the horizontal axis is the years of data available\n",
+    "plt.figure(figsize=(7, 60))\n",
+    "plt.barh(grouped['Label'], grouped['MaxYear'] - grouped['MinYear'], left=grouped['MinYear'], color='blue', height=0.8)\n",
+    "plt.xlabel('Year')\n",
+    "plt.ylabel('Source, State, Table Type')\n",
+    "plt.title('Years of Data Available by Source, State, and Table Type')\n",
+    "\n",
+    "# Use the bar position (0, 1, 2, ...) rather than the dataframe index so each label lines up with its bar\n",
+    "for pos, (_, row) in enumerate(grouped.iterrows()):\n",
+    "    plt.text(row['MinYear'], pos, f\"{row['MinYear']} - {row['MaxYear']}\", va='center', color='white')\n",
+    "\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find the datasets whose data appears to have stopped being updated\n",
+    "current_year = 2024\n",
+    "minimum_tabletype_counts = 10\n",
+    "\n",
+    "grouped['MaxYear'] = grouped['MaxYear'].fillna(0).astype(int)\n",
+    "tabletype_counts = grouped['TableType'].value_counts()\n",
+    "\n",
+    "# Only keep table types that occur often enough to give meaningful percentages\n",
+    "tabletype_counts = tabletype_counts[tabletype_counts >= minimum_tabletype_counts]\n",
+    "grouped = grouped[grouped['TableType'].isin(tabletype_counts.index)]\n",
+    "\n",
+    "# A dataset is considered stopped if its most recent year of data is before last year\n",
+    "stopped_datasets = grouped[grouped['MaxYear'] < (current_year - 1)]\n",
+    "stopped_tabletype_counts = stopped_datasets['TableType'].value_counts()\n",
+    "\n",
+    "# Plot bar charts of the number of all datasets and of stopped datasets by TableType\n",
+    "fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))\n",
+    "\n",
+    "tabletype_counts.plot(kind='bar', ax=axes[0])\n",
+    "axes[0].set_xlabel('Table Type')\n",
+    "axes[0].set_ylabel('Number of Datasets')\n",
+    "axes[0].set_title('Number of All Datasets by Table Type')\n",
+    "\n",
+    "stopped_tabletype_counts.plot(kind='bar', ax=axes[1])\n",
+    "axes[1].set_xlabel('Table Type')\n",
+    "axes[1].set_ylabel('Number of Datasets')\n",
+    "axes[1].set_title('Number of Stopped Datasets by Table Type')\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Find which table type has the highest ratio of stopped datasets\n",
+    "ratio = stopped_tabletype_counts / tabletype_counts\n",
+    "ratio = ratio.fillna(0)\n",
+    "\n",
+    "# Create a bar plot of the ratio and sort the values from high to low\n",
+    "ratio = ratio.sort_values(ascending=False)\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "ratio.plot(kind='bar')\n",
+    "plt.xlabel('Table Type')\n",
+    "plt.ylabel('Ratio of Stopped Datasets to All Datasets')\n",
+    "plt.title('Ratio of Stopped Datasets to All Datasets by Table Type')\n",
+    "plt.show()"
+   ]
+  },
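+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The charts above only show counts by table type. As a quick check, the sketch below lists the individual datasets that fell into the \"stopped\" bucket, assuming the `stopped_datasets` dataframe computed in the previous cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List the datasets counted as stopped, ordered by their most recent year of data\n",
+    "print(stopped_datasets[['Label', 'MaxYear']].sort_values('MaxYear').to_string(index=False))"
+   ]
+  },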
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For each unique combination of SourceName, State, and AgencyFull, count the table types with data\n",
+    "# released in the current or previous year (i.e. datasets that are still being updated)\n",
+    "\n",
+    "current_year_datasets = all_datasets[(all_datasets['coverage_end'].dt.year == current_year) | (all_datasets['coverage_end'].dt.year == current_year - 1)]\n",
+    "current_year_datasets_sum = current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull'])['TableType'].nunique()\n",
+    "current_year_datasets_sum = current_year_datasets_sum.sort_values(ascending=False)\n",
+    "\n",
+    "print(current_year_datasets_sum)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For each unique combination of SourceName, State, and AgencyFull, count the stopped datasets, which are\n",
+    "# defined as those with no data released this year or the previous year\n",
+    "stopped_current_year_datasets = all_datasets[(all_datasets['coverage_end'].dt.year != current_year) & (all_datasets['coverage_end'].dt.year != current_year - 1)]\n",
+    "\n",
+    "# Group the stopped datasets by SourceName, State, and AgencyFull, and count the number of unique TableTypes\n",
+    "stopped_current_year_datasets_sum = stopped_current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull'])['TableType'].nunique()\n",
+    "\n",
+    "# Sort the stopped dataset counts in descending order\n",
+    "stopped_current_year_datasets_sum = stopped_current_year_datasets_sum.sort_values(ascending=False)\n",
+    "\n",
+    "# Print the number of stopped datasets for each unique combination of SourceName, State, and AgencyFull\n",
+    "print(stopped_current_year_datasets_sum)\n"
+   ]
+  },
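+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The introduction mentions extending this analysis to ask, for a department that has stopped releasing some types of data, which other types it is still releasing. A minimal sketch of that comparison is below, assuming the `current_year_datasets` and `stopped_current_year_datasets` dataframes from the cells above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For each agency, compare the table types still being updated with the table types that have stopped\n",
+    "keys = ['SourceName', 'State', 'AgencyFull']\n",
+    "still_releasing = current_year_datasets.groupby(keys)['TableType'].unique()\n",
+    "no_longer_releasing = stopped_current_year_datasets.groupby(keys)['TableType'].unique()\n",
+    "comparison = pd.concat({'still_releasing': still_releasing, 'stopped': no_longer_releasing}, axis=1)\n",
+    "\n",
+    "# Only show agencies that have both at least one current and at least one stopped table type\n",
+    "print(comparison.dropna().head(20))"
+   ]
+  },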
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot current_year_datasets_sum against stopped_current_year_datasets_sum on a scatter chart where the\n",
+    "# x-axis is the number of current datasets and the y-axis is the number of stopped datasets.\n",
+    "# The merge keeps only the SourceName, State, and AgencyFull combinations that appear in both.\n",
+    "\n",
+    "merged_df = pd.merge(current_year_datasets_sum, stopped_current_year_datasets_sum, \n",
+    "                     on=['SourceName', 'State', 'AgencyFull'], \n",
+    "                     suffixes=('_current', '_stopped'))\n",
+    "\n",
+    "markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']\n",
+    "colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'lime', 'navy', 'teal']\n",
+    "\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "labels = []\n",
+    "handles = []\n",
+    "\n",
+    "# Give each point its own marker and color and collect labels for the legend\n",
+    "for i, (current, stopped) in enumerate(zip(merged_df['TableType_current'], merged_df['TableType_stopped'])):\n",
+    "    index_label = ', '.join(map(str, merged_df.index[i]))\n",
+    "    # The point annotation is left blank; the full label goes in the legend instead\n",
+    "    shortened_label = ''\n",
+    "    marker = markers[i % len(markers)]\n",
+    "    color = colors[i % len(colors)]\n",
+    "    plt.scatter(current, stopped, color=color, s=100, marker=marker)\n",
+    "    plt.annotate(shortened_label, (current, stopped), textcoords=\"offset points\", xytext=(0,10), ha='center')\n",
+    "    labels.append(index_label)\n",
+    "    handles.append(plt.Line2D([0], [0], marker=marker, color='w', markerfacecolor=color, markersize=10, label=index_label))\n",
+    "\n",
+    "plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)\n",
+    "plt.xlabel('Number of Current Datasets')\n",
+    "plt.ylabel('Number of Stopped Datasets')\n",
+    "plt.title('Current vs. Stopped Datasets by Source, State, and AgencyFull')\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('opd')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "a73158d29711b2da05ac73de25b71e5d8cae591f14917bba77a9573b5c85a0ce"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}