Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example #12 is done and ready for merge #13

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 254 additions & 0 deletions finding_stopped_datasets.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Finding datasets that have stopped being updated\n",
"This notebook shows how to find datasets that have stopped being updated.\n",
"It looks across all datasets and finds the datasets that have been stopped and then \n",
"does analysis with the table types to show which have the highest percentage of being stopped.\n",
"This can be extended to find for departments that have stopped releasing certain types of data, what other types are they still releasing."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import openpolicedata as opd\n",
"from openpolicedata.exceptions import OPD_DataUnavailableError\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get all datasets but exclude the openpolicing.stanford.edu to prevent duplicates\n",
"all_datasets = opd.datasets.query()\n",
"all_datasets['source_url'] = all_datasets['source_url'].astype(str)\n",
"stanford_mask = all_datasets['source_url'].str.contains(\"openpolicing.stanford.edu\")\n",
"all_datasets = all_datasets.loc[~stanford_mask]\n",
"\n",
"# Show the distinct (source, state) pairs that remain after filtering\n",
"unique_source_state_combinations = all_datasets[['SourceName', 'State']].drop_duplicates()\n",
"print(unique_source_state_combinations) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Limit the dataframe columns for analysis\n",
"selected_columns = ['State', 'SourceName', 'TableType', 'coverage_start', 'coverage_end']\n",
"filtered_df = pd.DataFrame(all_datasets, columns=selected_columns)\n",
"\n",
"# Rows missing either coverage date cannot produce a year range\n",
"filtered_df = filtered_df.dropna(subset=['coverage_start', 'coverage_end'])\n",
"\n",
"# Create entries for each year instead of a range of years. This will create more rows, but will make it easier to filter the data\n",
"filtered_df['ListOfYears'] = [\n",
"    list(range(int(start.year), int(end.year) + 1))\n",
"    for start, end in zip(filtered_df['coverage_start'], filtered_df['coverage_end'])\n",
"]\n",
"filtered_df = filtered_df.drop(columns=['coverage_start', 'coverage_end'])\n",
"#filtered_df = filtered_df.explode('ListOfYears')\n",
"\n",
"print(filtered_df.head())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Load data again from the file\n",
"df = filtered_df\n",
"# modify the TableType column values to remove the \" - \" and following text if it exists otherwise keep the original value\n",
"df['TableType'] = df['TableType'].str.split(' - ').str[0]\n",
"# remove all rows where grouped is None\n",
"df = df.dropna(subset=['ListOfYears'])\n",
"\n",
"# Calculate the minimum and maximum years from the combined lists\n",
"grouped=df\n",
"grouped['MinYear'] = grouped['ListOfYears'].apply(min)\n",
"grouped['MaxYear'] = grouped['ListOfYears'].apply(max)\n",
"\n",
"# Create a label for plotting\n",
"grouped['Label'] = grouped['SourceName'] + ', ' + grouped['State'] + ', ' + grouped['TableType']\n",
"\n",
"# create a horizontal bar chart where each element of the vertical axis is the unique combination of the 'SourceName', 'State', 'TableType' columns\n",
"# the horizontal axis should be the years of data available for each unique combination of 'SourceName', 'State', 'TableType'\n",
"plt.figure(figsize=(7, 60))\n",
"plt.barh(grouped['Label'], grouped['MaxYear'] - grouped['MinYear'], left=grouped['MinYear'], color='blue', height=0.8)\n",
"plt.xlabel('Year')\n",
"plt.ylabel('Source, State, Table Type')\n",
"plt.title('Years of Data Available by Source, State, and Table Type')\n",
"\n",
"for index, row in grouped.iterrows():\n",
" plt.text(row['MinYear'], index, f\"{row['MinYear']} - {row['MaxYear']}\", va='center', color='white')\n",
"\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Find the datasets where the data is most likely to be stopped within the year\n",
"current_year = 2024\n",
"minimum_tabletype_counts = 10\n",
"\n",
"grouped['MaxYear'] = grouped['MaxYear'].fillna(0).astype(int)\n",
"tabletype_counts = grouped['TableType'].value_counts()\n",
"\n",
"tabletype_counts = tabletype_counts[tabletype_counts >= minimum_tabletype_counts]\n",
"grouped = grouped[grouped['TableType'].isin(tabletype_counts.index)]\n",
"\n",
"stopped_datasets = grouped[grouped['MaxYear'] < (current_year-1)]\n",
"stopped_tabletype_counts = stopped_datasets['TableType'].value_counts()\n",
"\n",
"\n",
"# compute a bar graph histogram of the number of datasets that are stopped by TableType\n",
"fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))\n",
"\n",
"tabletype_counts.plot(kind='bar', ax=axes[0])\n",
"axes[0].set_xlabel('Table Type')\n",
"axes[0].set_ylabel('Number of Datasets')\n",
"axes[0].set_title('Number of All Datasets by Table Type')\n",
"\n",
"stopped_tabletype_counts.plot(kind='bar', ax=axes[1])\n",
"axes[1].set_xlabel('Table Type')\n",
"axes[1].set_ylabel('Number of Datasets')\n",
"axes[1].set_title('Number of Stopped Datasets by Table Type')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Find which type has the highest ratio of stopped datasets\n",
"ratio = stopped_tabletype_counts / tabletype_counts\n",
"ratio = ratio.fillna(0)\n",
"\n",
"# Create a bar plot of the ratio and sort the values from high to low\n",
"ratio = ratio.sort_values(ascending=False)\n",
"plt.figure(figsize=(10, 6))\n",
"ratio.plot(kind='bar')\n",
"plt.xlabel('Table Type')\n",
"plt.ylabel('Ratio of Stopped Datasets to All Datasets')\n",
"plt.title('Ratio of Stopped Datasets to All Datasets by Table Type')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For each unique combination of SourceName, State, and AgencyFull find the number of new datasets that have been released \n",
"# in the current year for all TableTypes\n",
"\n",
"# A dataset is \"current\" if its coverage ends in this year or the previous one\n",
"recent_mask = all_datasets['coverage_end'].dt.year.isin([current_year, current_year - 1])\n",
"current_year_datasets = all_datasets[recent_mask]\n",
"current_year_datasets_sum = (\n",
"    current_year_datasets\n",
"    .groupby(['SourceName', 'State', 'AgencyFull'])['TableType']\n",
"    .nunique()\n",
"    .sort_values(ascending=False)\n",
")\n",
"\n",
"print(current_year_datasets_sum)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For each unique combination of SourceName, State, and AgencyFull find the number of stopped datasets which is \n",
"# defined as not being released this year or the previous year.\n",
"# BUG FIX: the original filter used (year != current_year) | (year != current_year - 1),\n",
"# which is true for every row (no year can equal both values), so nothing was\n",
"# excluded and every dataset was counted as stopped. A stopped dataset must end\n",
"# before BOTH the current and the previous year.\n",
"# NOTE(review): rows with a missing coverage_end also pass this filter and are\n",
"# counted as stopped -- confirm that is the intended treatment.\n",
"stopped_current_year_datasets = all_datasets[~all_datasets['coverage_end'].dt.year.isin([current_year, current_year - 1])]\n",
"\n",
"# Group the filtered datasets by SourceName and State, and count the number of unique TableTypes\n",
"stopped_current_year_datasets_sum = stopped_current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull' ])['TableType'].nunique()\n",
"\n",
"# sort stopped_current_year_datasets_sum in descending order\n",
"stopped_current_year_datasets_sum = stopped_current_year_datasets_sum.sort_values(ascending=False)\n",
"\n",
"# Print the number of stopped datasets for each unique combination of SourceName, State, and AgencyFull\n",
"print(stopped_current_year_datasets_sum)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot the current_year_datasets_sum and stopped_current_year_datasets_sum on a scatter chart where the x-axis is the number of new datasets\n",
"# first match only the SourceName, State, and AgencyFull that are in both current_year_datasets_sum and stopped_current_year_datasets_sum\n",
"# and the y-axis is the number of stopped datasets.\n",
"\n",
"# Inner-join the two grouped counts; both Series are named 'TableType', so the\n",
"# suffixes distinguish the current vs. stopped columns in the merged frame\n",
"merged_df = pd.merge(current_year_datasets_sum, stopped_current_year_datasets_sum,\n",
"                     on=['SourceName', 'State', 'AgencyFull'],\n",
"                     suffixes=('_current', '_stopped'))\n",
"\n",
"markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']\n",
"colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'lime', 'navy', 'teal']\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"labels = []\n",
"handles = []\n",
"\n",
"# Draw each point with a cycling marker/color pair and build proxy handles for the legend\n",
"pairs = zip(merged_df['TableType_current'], merged_df['TableType_stopped'])\n",
"for row_num, (n_current, n_stopped) in enumerate(pairs):\n",
"    legend_text = ', '.join(map(str, merged_df.index[row_num]))\n",
"    shortened_label = ''  # on-plot annotation is intentionally blank; the legend carries the names\n",
"    marker_style = markers[row_num % len(markers)]\n",
"    point_color = colors[row_num % len(colors)]\n",
"    plt.scatter(n_current, n_stopped, color=point_color, s=100, marker=marker_style)\n",
"    plt.annotate(shortened_label, (n_current, n_stopped), textcoords=\"offset points\", xytext=(0,10), ha='center')\n",
"    labels.append(legend_text)\n",
"    handles.append(plt.Line2D([0], [0], marker=marker_style, color='w', markerfacecolor=point_color, markersize=10, label=legend_text))\n",
"\n",
"plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)\n",
"plt.xlabel('Number of New Datasets')\n",
"plt.ylabel('Number of Stopped Datasets')\n",
"plt.title('New vs. Stopped Datasets by Source, State, and AgencyFull')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('opd')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "a73158d29711b2da05ac73de25b71e5d8cae591f14917bba77a9573b5c85a0ce"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}