From 5eb6eaa160b41396ae34a7b68d6b48b206008ecc Mon Sep 17 00:00:00 2001 From: archenclan Date: Sun, 4 Dec 2022 16:12:08 -0800 Subject: [PATCH] Completed sub puzzle #4 --- lec/lec04/04-pandas-ii.ipynb | 8689 +++++++++++----------------------- 1 file changed, 2655 insertions(+), 6034 deletions(-) diff --git a/lec/lec04/04-pandas-ii.ipynb b/lec/lec04/04-pandas-ii.ipynb index 743d770..e83783f 100644 --- a/lec/lec04/04-pandas-ii.ipynb +++ b/lec/lec04/04-pandas-ii.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": { "id": "uFRWFwzgeUGP" }, @@ -34,10 +34,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": { "id": "dom645G0eUGR", - "outputId": "d7f9171f-dc7d-41d1-ef8a-511634276fae", + "outputId": "d749cb9a-3945-413e-cb8d-d8d50b543b88", "colab": { "base_uri": "https://localhost:8080/", "height": 204 @@ -48,16 +48,16 @@ "output_type": "execute_result", "data": { "text/plain": [ - " State Sex Year Name Count\n", - "393223 CA M 2019 Yoel 23\n", - "207521 CA F 2014 Venice 13\n", - "232729 CA F 2021 Kenia 59\n", - "281058 CA M 1971 Lonnie 69\n", - "381469 CA M 2015 Alexandro 29" + " State Sex Year Name Count\n", + "391597 CA M 2018 Shiva 7\n", + "49269 CA F 1963 Katherin 5\n", + "328661 CA M 1995 Jacoby 5\n", + "20495 CA F 1943 Lolita 6\n", + "29936 CA F 1951 Jene 5" ], "text/html": [ "\n", - "
\n", + "
\n", "
\n", "
\n", "\n", + "\n", + " \n", + "
\n", + "
\n", + " " ] }, - "execution_count": 39, "metadata": {}, - "output_type": "execute_result" + "execution_count": 35 } ], "source": [ @@ -3878,82 +3973,43 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OkLXdmeBeUGs", - "outputId": "da460e4a-9232-48c5-ce54-2839c84b0f30" - }, - "outputs": [ - { - "ename": "IndexError", - "evalue": "single positional indexer is out-of-bounds", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Input \u001b[0;32mIn [38]\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m babynames[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39munique()[\u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m100\u001b[39m]:\n\u001b[1;32m 5\u001b[0m counts_of_current_name \u001b[38;5;241m=\u001b[39m female_babynames[female_babynames[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m==\u001b[39m name][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCount\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m----> 6\u001b[0m rtps[name] \u001b[38;5;241m=\u001b[39m \u001b[43mratio_to_peak\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcounts_of_current_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m#convert to series\u001b[39;00m\n\u001b[1;32m 9\u001b[0m rtps \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mSeries(rtps) \n", - "Input \u001b[0;32mIn [31]\u001b[0m, in \u001b[0;36mratio_to_peak\u001b[0;34m(series)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mratio_to_peak\u001b[39m(series):\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mseries\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m/\u001b[39m \u001b[38;5;28mmax\u001b[39m(series)\n", - "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/indexing.py:931\u001b[0m, in \u001b[0;36m_LocationIndexer.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 928\u001b[0m axis \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxis \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 930\u001b[0m maybe_callable \u001b[38;5;241m=\u001b[39m com\u001b[38;5;241m.\u001b[39mapply_if_callable(key, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj)\n\u001b[0;32m--> 931\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmaybe_callable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/indexing.py:1566\u001b[0m, in \u001b[0;36m_iLocIndexer._getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1563\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot index by location index with a non-integer key\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1565\u001b[0m \u001b[38;5;66;03m# validate the location\u001b[39;00m\n\u001b[0;32m-> 1566\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_integer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1568\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39m_ixs(key, axis\u001b[38;5;241m=\u001b[39maxis)\n", - "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/indexing.py:1500\u001b[0m, in \u001b[0;36m_iLocIndexer._validate_integer\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1498\u001b[0m len_axis \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39m_get_axis(axis))\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m len_axis \u001b[38;5;129;01mor\u001b[39;00m key \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m-\u001b[39mlen_axis:\n\u001b[0;32m-> 1500\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msingle positional indexer is out-of-bounds\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mIndexError\u001b[0m: single positional indexer is out-of-bounds" - ] - } - ], - "source": [ - "#build dictionary where entry i is the ammd for the given name\n", - "#e.g. 
rtps[\"jennifer\"] should be 0.0231\n", - "rtps = {}\n", - "for name in babynames[\"Name\"].unique()[0:100]:\n", - " counts_of_current_name = female_babynames[female_babynames[\"Name\"] == name][\"Count\"]\n", - " rtps[name] = ratio_to_peak(counts_of_current_name)\n", - " \n", - "#convert to series\n", - "rtps = pd.Series(rtps) \n", - "rtps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "m52Ru4SweUGs" - }, - "outputs": [], - "source": [ - "rtps.sort_values()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hYJkHlfqeUGt" - }, - "source": [ - "### Approach 2: Use groupby.agg" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nX83qPOgeUGv" - }, "source": [ - "Instead, we can use the very powerful groupby.agg operation, which allows us to simply and efficiently compute what we want." - ] - }, - { - "cell_type": "code", - "execution_count": null, + "female_babynames" + ], "metadata": { - "id": "gCO8S8yTeUGv", - "outputId": "6a69c674-6dac-4e67-c6f8-e2b19a9d1a95" + "id": "w7utWcCMbBV6", + "outputId": "c9c152a2-3fff-49be-84fc-c79191c5138c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 419 + } }, + "execution_count": 39, "outputs": [ { + "output_type": "execute_result", "data": { + "text/plain": [ + " State Sex Year Name Count nl\n", + "46 CA F 1910 Jessie 32 6\n", + "171 CA F 1910 Daisy 7 5\n", + "8 CA F 1910 Virginia 101 8\n", + "80 CA F 1910 Vivian 22 6\n", + "63 CA F 1910 Charlotte 26 9\n", + "... ... .. ... ... ... ..\n", + "234616 CA F 2021 Jazlene 8 7\n", + "233820 CA F 2021 Milania 14 7\n", + "233823 CA F 2021 Monique 14 7\n", + "234869 CA F 2021 Lovelyn 7 7\n", + "232463 CA F 2021 Capri 108 5\n", + "\n", + "[235791 rows x 6 columns]" + ], "text/html": [ - "
\n", + "\n", + "
\n", + "
\n", + "
\n", "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -4125,147 +4088,347 @@ " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
StateSexYearNameCount
131013CAF1994Leandrea5
115942CAF1990Deandrea5
1087158CAF1988Deandrea51910Virginia1018
10196280CAF1986Deandrea1910Vivian226
21247463CAF2015Leandra71910Charlotte269
...............
79750234616CAF1978Kia92021Jazlene87
165624233820CAF2004Kai232021Milania147
59532233823CAF1969Lia252021Monique147
59529234869CAF1969Kay252021Lovelyn77
44283232463CAF1961Dee702021Capri1085
\n", - "

232102 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " State Sex Year Name Count\n", - "131013 CA F 1994 Leandrea 5\n", - "115942 CA F 1990 Deandrea 5\n", - "108715 CA F 1988 Deandrea 5\n", - "101962 CA F 1986 Deandrea 6\n", - "212474 CA F 2015 Leandra 7\n", - "... ... .. ... ... ...\n", - "79750 CA F 1978 Kia 9\n", - "165624 CA F 2004 Kai 23\n", - "59532 CA F 1969 Lia 25\n", - "59529 CA F 1969 Kay 25\n", - "44283 CA F 1961 Dee 70\n", - "\n", - "[232102 rows x 5 columns]" + "

235791 rows × 6 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ] }, - "execution_count": 41, "metadata": {}, - "output_type": "execute_result" + "execution_count": 39 } - ], - "source": [ - "female_babynames" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": { - "scrolled": true, - "id": "gMf9w8gleUGw", - "outputId": "0738bee5-237e-4b5d-f500-2b0c04ad9322" + "id": "OkLXdmeBeUGs", + "outputId": "0a728b7d-cf30-43c7-d5ec-dbef4d610cb8", + "colab": { + "base_uri": "https://localhost:8080/" + } }, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ - "Name\n", - "Aadhira 2018\n", - "Aadhira 2017\n", - "Aadhira 2020\n", - "Name: Year, dtype: int64" + "Jessie 0.462585\n", + "Daisy 0.269391\n", + "Virginia 0.034395\n", + "Vivian 0.586035\n", + "Charlotte 0.845932\n", + "Agnes 0.172043\n", + "Edna 0.037736\n", + "Sadie 0.679803\n", + "Florence 0.167785\n", + "Emily 0.365632\n", + "Helene 0.166667\n", + "Patricia 0.005640\n", + "Katherine 0.183217\n", + "Merle 0.291667\n", + "Angie 0.245833\n", + "Celia 0.362745\n", + "Marguerite 0.068182\n", + "Dora 0.138686\n", + "Kathryn 0.045760\n", + "Gloria 0.071429\n", + "Amy 0.105972\n", + "Jean 0.023913\n", + "Jacqueline 0.080717\n", + "Cecile 0.250000\n", + "Viola 0.185567\n", + "Sally 0.050526\n", + "Alyce 0.318182\n", + "Minnie 0.243902\n", + "Laura 0.040357\n", + "Vera 0.806202\n", + "Mabel 0.759036\n", + "Rosemary 0.284133\n", + "Jeannette 0.044776\n", + "Esther 0.479554\n", + "Margarita 0.101167\n", + "Janet 0.010283\n", + "Willie 0.217391\n", + "Amelia 0.920029\n", + "Aileen 0.421875\n", + "Ethel 0.046512\n", + "Elinor 0.285714\n", + "Rosalie 0.595652\n", + "Joyce 0.071023\n", + "Adeline 0.637066\n", + "Della 0.200000\n", + "Verna 0.086207\n", + "Althea 0.357143\n", + "Josephine 0.820261\n", + "Elaine 0.181208\n", + "Katie 0.091503\n", + "dtype: float64" ] }, - "execution_count": 54, "metadata": {}, - "output_type": "execute_result" + "execution_count": 40 } ], "source": [ - "rtp_table = 
female_babynames.groupby(\"Name\").agg('count')\n", - "female_babynames.set_index('Name').loc['Aadhira', :]['Year']\n", - "\n", - "#Note: If this cell crashes, comment out the code and use the female_babynames.groupby(\"Name\")[[\"Count\"]].agg(ratio_to_peak) instead" + "#build dictionary where entry i is the ammd for the given name\n", + "#e.g. rtps[\"jennifer\"] should be 0.0231\n", + "rtps = {}\n", + "for name in female_babynames[\"Name\"].unique()[0:50]:\n", + " counts_of_current_name = female_babynames[female_babynames[\"Name\"] == name][\"Count\"]\n", + " rtps[name] = ratio_to_peak(counts_of_current_name)\n", + " \n", + "#convert to series\n", + "rtps = pd.Series(rtps) \n", + "rtps" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 41, "metadata": { - "id": "OFnccf_5eUGw" + "id": "m52Ru4SweUGs", + "outputId": "7e9fd6b8-a795-4e24-f662-e0726ec06f21", + "colab": { + "base_uri": "https://localhost:8080/" + } }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Patricia 0.005640\n", + "Janet 0.010283\n", + "Jean 0.023913\n", + "Virginia 0.034395\n", + "Edna 0.037736\n", + "Laura 0.040357\n", + "Jeannette 0.044776\n", + "Kathryn 0.045760\n", + "Ethel 0.046512\n", + "Sally 0.050526\n", + "Marguerite 0.068182\n", + "Joyce 0.071023\n", + "Gloria 0.071429\n", + "Jacqueline 0.080717\n", + "Verna 0.086207\n", + "Katie 0.091503\n", + "Margarita 0.101167\n", + "Amy 0.105972\n", + "Dora 0.138686\n", + "Helene 0.166667\n", + "Florence 0.167785\n", + "Agnes 0.172043\n", + "Elaine 0.181208\n", + "Katherine 0.183217\n", + "Viola 0.185567\n", + "Della 0.200000\n", + "Willie 0.217391\n", + "Minnie 0.243902\n", + "Angie 0.245833\n", + "Cecile 0.250000\n", + "Daisy 0.269391\n", + "Rosemary 0.284133\n", + "Elinor 0.285714\n", + "Merle 0.291667\n", + "Alyce 0.318182\n", + "Althea 0.357143\n", + "Celia 0.362745\n", + "Emily 0.365632\n", + "Aileen 0.421875\n", + "Jessie 0.462585\n", + "Esther 0.479554\n", + "Vivian 
0.586035\n", + "Rosalie 0.595652\n", + "Adeline 0.637066\n", + "Sadie 0.679803\n", + "Mabel 0.759036\n", + "Vera 0.806202\n", + "Josephine 0.820261\n", + "Charlotte 0.845932\n", + "Amelia 0.920029\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 41 + } + ], "source": [ - "This is simply the equivalent of [http://data8.org/datascience/_autosummary/datascience.tables.Table.group.html](http://data8.org/datascience/_autosummary/datascience.tables.Table.group.html) from Data8, e.g. if babynames were using `Table`, our code would read:\n", - "\n", - "`female_babynames.group(\"Name\", ratio_to_peak)`\n", - "\n", - "For a visual review of groupby, see this [lecture slide](https://docs.google.com/presentation/d/1FC-cs5MTGSkDzI_7R_ZENgwoHQ4aVamxFOpJuWT0fo0/edit#slide=id.g477ed0f02e_0_390)." + "rtps.sort_values()" ] }, { "cell_type": "markdown", "metadata": { - "id": "eDeUYl-leUGx" + "id": "hYJkHlfqeUGt" }, "source": [ - "If you're using an early enough version of pandas, the code above will not crash, and will automatically drop columns for which `ratio_to_peak` fails, e.g. the Sex column. \n", - "\n", - "However, according to a warning message that is generated as of January 2022, at some point this code will no longer be considered valid Pandas code, and the code will crash on columns for which the aggregation function is undefined. \n", - "\n", - "Whether we're trying to avoid a crash, or just want a clean DataFrame, let's explicitly select only the Count column. The idea here is that we don't really care about the meaningless Year column, so we may as well exclude it when we compute our `ratio_to_peak` values. " + "### Approach 2: Use groupby.agg" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nX83qPOgeUGv" + }, + "source": [ + "Instead, we can use the very powerful groupby.agg operation, which allows us to simply and efficiently compute what we want." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": { - "id": "0MfZc-ZTeUGx", - "outputId": "5cbaa632-af1b-4b8f-874d-6a8f22e90c79" + "id": "gCO8S8yTeUGv", + "outputId": "21c03c70-9c83-4e59-a2b5-63f7cd856ae0", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } }, "outputs": [ { + "output_type": "execute_result", "data": { + "text/plain": [ + " State Sex Year Name Count nl\n", + "46 CA F 1910 Jessie 32 6\n", + "171 CA F 1910 Daisy 7 5\n", + "8 CA F 1910 Virginia 101 8\n", + "80 CA F 1910 Vivian 22 6\n", + "63 CA F 1910 Charlotte 26 9" + ], "text/html": [ - "
\n", + "\n", + "
\n", + "
\n", + "
\n", "\n", + "\n", + " \n", + "
\n", + "
\n", + " " ] }, - "execution_count": 43, "metadata": {}, - "output_type": "execute_result" + "execution_count": 42 } ], "source": [ - "rtp_table = female_babynames.groupby(\"Name\")[[\"Count\"]].agg(ratio_to_peak)\n", - "rtp_table" + "female_babynames.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": { - "id": "NfRBnifheUGx", - "outputId": "ec1ab2f5-09a4-4368-a3f2-da62b859be9b" + "id": "FhnVSqW0eUGw", + "outputId": "1a69887e-55ff-460a-ffdb-877559c26b18", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 419 + } }, "outputs": [ { + "output_type": "execute_result", "data": { + "text/plain": [ + " State Sex Year Name Count nl\n", + "46 CA F 1910 Jessie 32 6\n", + "171 CA F 1910 Daisy 7 5\n", + "8 CA F 1910 Virginia 101 8\n", + "80 CA F 1910 Vivian 22 6\n", + "63 CA F 1910 Charlotte 26 9\n", + "... ... .. ... ... ... ..\n", + "234616 CA F 2021 Jazlene 8 7\n", + "233820 CA F 2021 Milania 14 7\n", + "233823 CA F 2021 Monique 14 7\n", + "234869 CA F 2021 Lovelyn 7 7\n", + "232463 CA F 2021 Capri 108 5\n", + "\n", + "[235791 rows x 6 columns]" + ], "text/html": [ - "
\n", + "\n", + "
\n", + "
\n", + "
\n", "\n", + "\n", + " \n", + "
\n", + "
\n", + " " ] }, - "execution_count": 44, "metadata": {}, - "output_type": "execute_result" + "execution_count": 43 } ], "source": [ - "# this code renames the Count column to RTP. You'll see this syntax in lab\n", - "rtp_table = rtp_table.rename(columns = {\"Count\": \"Count RTP\"})\n", - "rtp_table" + "female_babynames" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": { - "id": "OpXQkSHpeUGy", - "outputId": "8d935b97-8ce5-4529-b344-104d6daaf04e" + "scrolled": true, + "id": "gMf9w8gleUGw", + "outputId": "2df01bca-fda7-4560-b869-2c61bbea1ed5", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 450 + } }, "outputs": [ { + "output_type": "execute_result", "data": { + "text/plain": [ + " State Sex Year Count nl\n", + "Name \n", + "Aadhira 4 4 4 4 4\n", + "Aadhya 13 13 13 13 13\n", + "Aadya 14 14 14 14 14\n", + "Aahana 13 13 13 13 13\n", + "Aahna 1 1 1 1 1\n", + "... ... ... ... ... ..\n", + "Zyanya 22 22 22 22 22\n", + "Zyla 12 12 12 12 12\n", + "Zylah 11 11 11 11 11\n", + "Zyra 10 10 10 10 10\n", + "Zyrah 4 4 4 4 4\n", + "\n", + "[13661 rows x 5 columns]" + ], "text/html": [ - "
\n", + "\n", + "
\n", + "
\n", + "
\n", "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Count
Name
Aadhira22
Aadhya368
Aadya230
Aahana129
Aahna7
......
Zyanya174
Zyla107
Zylah95
Zyra71
Zyrah21
\n", - "

13525 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Count\n", - "Name \n", - "Aadhira 22\n", - "Aadhya 368\n", - "Aadya 230\n", - "Aahana 129\n", - "Aahna 7\n", - "... ...\n", - "Zyanya 174\n", - "Zyla 107\n", - "Zylah 95\n", - "Zyra 71\n", - "Zyrah 21\n", - "\n", - "[13525 rows x 1 columns]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "puzzle1 = female_babynames.groupby(\"Name\")[[\"Count\"]].agg(sum)\n", - "puzzle1" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eyEaAiWaeUG1" - }, - "source": [ - "Groupby puzzle #2: Try to create a groupby.agg call that gives total babies born in each year." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ArWLhQFheUG1", - "outputId": "cba8b46a-9d55-4263-d0cd-6212911a4853" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Count
Year
19105950
19116602
19129804
191311860
191413815
......
2016203360
2017195352
2018188919
2019183644
2020172003
\n", - "

111 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Count\n", - "Year \n", - "1910 5950\n", - "1911 6602\n", - "1912 9804\n", - "1913 11860\n", - "1914 13815\n", - "... ...\n", - "2016 203360\n", - "2017 195352\n", - "2018 188919\n", - "2019 183644\n", - "2020 172003\n", - "\n", - "[111 rows x 1 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "puzzle2 = female_babynames.groupby(\"Year\")[[\"Count\"]].agg(sum)\n", - "puzzle2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0WciSugSeUG2" - }, - "source": [ - "We can write this using a groupby shorthand aggregation method. Here, `sum()` is shorthand for `groupby.agg(sum)`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uiE1DcO6eUG2" - }, - "outputs": [], - "source": [ - "puzzle2 = female_babynames.groupby(\"Year\")[[\"Count\"]].sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bjcV97opeUG2", - "outputId": "01fa0438-467f-4d90-e0ce-7384c2c7f123" - }, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "Year=%{x}
Count=%{y}", - "legendgroup": "", - "line": { - "color": "#636efa", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "", - "orientation": "v", - "showlegend": false, - "type": "scatter", - "x": [ - 1910, - 1911, - 1912, - 1913, - 1914, - 1915, - 1916, - 1917, - 1918, - 1919, - 1920, - 1921, - 1922, - 1923, - 1924, - 1925, - 1926, - 1927, - 1928, - 1929, - 1930, - 1931, - 1932, - 1933, - 1934, - 1935, - 1936, - 1937, - 1938, - 1939, - 1940, - 1941, - 1942, - 1943, - 1944, - 1945, - 1946, - 1947, - 1948, - 1949, - 1950, - 1951, - 1952, - 1953, - 1954, - 1955, - 1956, - 1957, - 1958, - 1959, - 1960, - 1961, - 1962, - 1963, - 1964, - 1965, - 1966, - 1967, - 1968, - 1969, - 1970, - 1971, - 1972, - 1973, - 1974, - 1975, - 1976, - 1977, - 1978, - 1979, - 1980, - 1981, - 1982, - 1983, - 1984, - 1985, - 1986, - 1987, - 1988, - 1989, - 1990, - 1991, - 1992, - 1993, - 1994, - 1995, - 1996, - 1997, - 1998, - 1999, - 2000, - 2001, - 2002, - 2003, - 2004, - 2005, - 2006, - 2007, - 2008, - 2009, - 2010, - 2011, - 2012, - 2013, - 2014, - 2015, - 2016, - 2017, - 2018, - 2019, - 2020 - ], - "xaxis": "x", - "y": [ - 5950, - 6602, - 9804, - 11860, - 13815, - 18643, - 19555, - 20864, - 23052, - 23290, - 28136, - 30479, - 30917, - 34122, - 37892, - 36915, - 36125, - 36784, - 36752, - 35732, - 37249, - 35102, - 34227, - 32892, - 34211, - 34917, - 36814, - 40591, - 44515, - 45620, - 49921, - 55732, - 69019, - 77970, - 79835, - 83505, - 99287, - 112884, - 111415, - 113221, - 113544, - 120926, - 131348, - 138849, - 143498, - 146169, - 156429, - 163942, - 162923, - 166298, - 172632, - 175044, - 173455, - 174104, - 171883, - 161654, - 153735, - 152925, - 153795, - 159870, - 164020, - 148397, - 137064, - 130778, - 136622, - 138439, - 143866, - 149564, - 153319, - 163311, - 173691, - 181276, - 185722, - 187691, - 192577, - 202218, - 207649, - 215088, - 228380, - 243970, - 262380, - 261484, - 256727, - 249565, - 242460, - 234569, - 229760, - 222828, - 
220983, - 219039, - 223782, - 222243, - 222689, - 226609, - 228753, - 230344, - 234687, - 236172, - 229229, - 218201, - 210871, - 207494, - 208926, - 204939, - 209416, - 205093, - 203360, - 195352, - 188919, - 183644, - 172003 - ], - "yaxis": "y" - } - ], - "layout": { - "autosize": true, - "font": { - "size": 15 - }, - "legend": { - "tracegroupgap": 0 - }, - "margin": { - "t": 60 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - 
"heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 
0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 
0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - 
"colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": 
"white", - "zerolinewidth": 2 - } - } - }, - "xaxis": { - "anchor": "y", - "autorange": true, - "domain": [ - 0, - 1 - ], - "range": [ - 1910, - 2020 - ], - "title": { - "text": "Year" - }, - "type": "linear" - }, - "yaxis": { - "anchor": "x", - "autorange": true, - "domain": [ - 0, - 1 - ], - "range": [ - -8296.111111111111, - 276626.1111111111 - ], - "title": { - "text": "Count" - }, - "type": "linear" - } - } - }, - "image/png": "", + "

13661 rows × 5 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 46 + } + ], + "source": [ + "rtp_table = female_babynames.groupby(\"Name\").agg('count')\n", + "female_babynames.set_index('Name').loc['Aadhira', :]['Year']\n", + "\n", + "female_babynames.groupby(\"Name\").agg('count')\n", + "\n", + "#Note: If this cell crashes, comment out the code and use the female_babynames.groupby(\"Name\")[[\"Count\"]].agg(ratio_to_peak) instead" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OFnccf_5eUGw" + }, + "source": [ + "This is simply the equivalent of [http://data8.org/datascience/_autosummary/datascience.tables.Table.group.html](http://data8.org/datascience/_autosummary/datascience.tables.Table.group.html) from Data8, e.g. if babynames were using `Table`, our code would read:\n", + "\n", + "`female_babynames.group(\"Name\", ratio_to_peak)`\n", + "\n", + "For a visual review of groupby, see this [lecture slide](https://docs.google.com/presentation/d/1FC-cs5MTGSkDzI_7R_ZENgwoHQ4aVamxFOpJuWT0fo0/edit#slide=id.g477ed0f02e_0_390)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eDeUYl-leUGx" + }, + "source": [ + "If you're using an early enough version of pandas, the code above will not crash, and will automatically drop columns for which `ratio_to_peak` fails, e.g. the Sex column. \n", + "\n", + "However, according to a warning message that is generated as of January 2022, at some point this code will no longer be considered valid Pandas code, and the code will crash on columns for which the aggregation function is undefined. \n", + "\n", + "Whether we're trying to avoid a crash, or just want a clean DataFrame, let's explicitly select only the Count column. The idea here is that we don't really care about the meaningless Year column, so we may as well exclude it when we compute our `ratio_to_peak` values. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "id": "0MfZc-ZTeUGx", + "outputId": "c7e46052-6556-45bf-e52c-d1d7c9bb22ba", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 450 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Count\n", + "Name \n", + "Aadhira 0.700000\n", + "Aadhya 0.580000\n", + "Aadya 0.724138\n", + "Aahana 0.192308\n", + "Aahna 1.000000\n", + "... ...\n", + "Zyanya 0.857143\n", + "Zyla 1.000000\n", + "Zylah 1.000000\n", + "Zyra 1.000000\n", + "Zyrah 0.833333\n", + "\n", + "[13661 rows x 1 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Count
Name
Aadhira0.700000
Aadhya0.580000
Aadya0.724138
Aahana0.192308
Aahna1.000000
......
Zyanya0.857143
Zyla1.000000
Zylah1.000000
Zyra1.000000
Zyrah0.833333
\n", + "

13661 rows × 1 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 50 + } + ], + "source": [ + "rtp_table = female_babynames.groupby(\"Name\")[[\"Count\"]].agg(ratio_to_peak)\n", + "rtp_table" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "id": "NfRBnifheUGx", + "outputId": "ea699835-d4a7-48ed-f938-762f87b61513", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 450 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Count RTP\n", + "Name \n", + "Aadhira 0.700000\n", + "Aadhya 0.580000\n", + "Aadya 0.724138\n", + "Aahana 0.192308\n", + "Aahna 1.000000\n", + "... ...\n", + "Zyanya 0.857143\n", + "Zyla 1.000000\n", + "Zylah 1.000000\n", + "Zyra 1.000000\n", + "Zyrah 0.833333\n", + "\n", + "[13661 rows x 1 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Count RTP
Name
Aadhira0.700000
Aadhya0.580000
Aadya0.724138
Aahana0.192308
Aahna1.000000
......
Zyanya0.857143
Zyla1.000000
Zylah1.000000
Zyra1.000000
Zyrah0.833333
\n", + "

13661 rows × 1 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 51 + } + ], + "source": [ + "# this code renames the Count column to \"Count RTP\". You'll see this syntax in lab\n", + "rtp_table = rtp_table.rename(columns = {\"Count\": \"Count RTP\"})\n", + "rtp_table" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "id": "OpXQkSHpeUGy", + "outputId": "8deed9c0-d6e8-402b-e123-c553d6fe404d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 450 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Count RTP\n", + "Name \n", + "Debra 0.001260\n", + "Susan 0.002034\n", + "Debbie 0.002817\n", + "Cheryl 0.003273\n", + "Carol 0.003635\n", + "... ...\n", + "Jovi 1.000000\n", + "Neta 1.000000\n", + "Doni 1.000000\n", + "Dondi 1.000000\n", + "Kela 1.000000\n", + "\n", + "[13661 rows x 1 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Count RTP
Name
Debra0.001260
Susan0.002034
Debbie0.002817
Cheryl0.003273
Carol0.003635
......
Jovi1.000000
Neta1.000000
Doni1.000000
Dondi1.000000
Kela1.000000
\n", + "

13661 rows × 1 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 52 + } + ], + "source": [ + "rtp_table.sort_values(\"Count RTP\")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "id": "pvGtucJEeUGz", + "outputId": "ea5e903b-8ab2-41e1-b8ef-10f4262331e6", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ], + "source": [ + "fig = px.line(babynames.query(\"Name == 'Debra' and Sex == 'F'\"), x = \"Year\", y = \"Count\")\n", + "fig.update_layout(font_size = 16)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "id": "xQwR_gVCeUGz", + "outputId": "c9f43937-1119-44fa-a4e9-44729c160459", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['Debra', 'Susan', 'Debbie', 'Cheryl', 'Carol', 'Tammy', 'Terri',\n", + " 'Shannon', 'Deborah', 'Carolyn'],\n", + " dtype='object', name='Name')" + ] + }, + "metadata": {}, + "execution_count": 54 + } + ], + "source": [ + "top10 = rtp_table.sort_values(\"Count RTP\").head(10).index\n", + "top10" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "id": "iigsDtB3eUG0", + "outputId": "c34d86bc-a6cd-45b8-c079-37c4744caa3d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ], + "source": [ + "fig = px.line(babynames.query(\"Name in @top10 and Sex == 'F'\"), x = \"Year\", y = \"Count\", color = \"Name\")\n", + "fig.update_layout(font_size = 13)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QQ1nsUDKeUG0" + }, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jHjJJqYEeUG0" + }, + "source": [ + "## Some Additional Groupby Puzzles" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o-ioLoGZeUG1" + }, + "source": [ + "Groupby puzzle #1: Try to create a groupby.agg call that gives the total babies born with each name." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "id": "uTAm4Kn1eUG1", + "outputId": "80f90c0a-a1e7-4cb9-e301-9826a32d9e72", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 450 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Count\n", + "Name \n", + "Aadhira 29\n", + "Aadhya 397\n", + "Aadya 251\n", + "Aahana 134\n", + "Aahna 7\n", + "... ...\n", + "Zyanya 186\n", + "Zyla 124\n", + "Zylah 109\n", + "Zyra 87\n", + "Zyrah 21\n", + "\n", + "[13661 rows x 1 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Count
Name
Aadhira29
Aadhya397
Aadya251
Aahana134
Aahna7
......
Zyanya186
Zyla124
Zylah109
Zyra87
Zyrah21
\n", + "

13661 rows × 1 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 56 + } + ], + "source": [ + "puzzle1 = female_babynames.groupby(\"Name\")[[\"Count\"]].agg(sum)\n", + "puzzle1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eyEaAiWaeUG1" + }, + "source": [ + "Groupby puzzle #2: Try to create a groupby.agg call that gives total babies born in each year." + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "id": "ArWLhQFheUG1", + "outputId": "5e5bfcd4-e20b-46e5-b14f-bb3d0f5a4261", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 450 + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Count\n", + "Year \n", + "1910 9163\n", + "1911 9983\n", + "1912 17946\n", + "1913 22094\n", + "1914 26926\n", + "... ...\n", + "2017 410835\n", + "2018 395151\n", + "2019 386504\n", + "2020 362180\n", + "2021 359997\n", + "\n", + "[112 rows x 1 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Count
Year
19109163
19119983
191217946
191322094
191426926
......
2017410835
2018395151
2019386504
2020362180
2021359997
\n", + "

112 rows × 1 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 60 + } + ], + "source": [ + "puzzle2 = female_babynames.groupby(\"Year\")[[\"Count\"]].agg(sum) # female babynames only; overwritten by the next line\n", + "puzzle2 = babynames.groupby(\"Year\")[[\"Count\"]].agg(sum) # both male and female babynames\n", + "\n", + "puzzle2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0WciSugSeUG2" + }, + "source": [ + "We can write this using a groupby shorthand aggregation method. Here, `sum()` is shorthand for `groupby.agg(sum)`." + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "id": "uiE1DcO6eUG2" + }, + "outputs": [], + "source": [ + "puzzle2 = babynames.groupby(\"Year\")[[\"Count\"]].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "id": "bjcV97opeUG2", + "outputId": "eeb8d60c-99e8-4a44-f1ed-0244b77f58ae", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { "text/html": [ - "
\n", + "
" + " }) };
\n", + "\n", + "" ] }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -11167,16 +7547,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": { "id": "K4iRbh6yeUG4", - "outputId": "3e56a4b8-27d5-4919-b969-47761e293acb" + "outputId": "ca72f545-3cea-45fa-9ab9-b1c916ceb1d5", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } }, "outputs": [ { + "output_type": "execute_result", "data": { + "text/plain": [ + " Year Candidate Party Popular vote Result %\n", + "137 1988 Michael Dukakis Democratic 41809074 loss 45.770691\n", + "172 2016 Darrell Castle Constitution 203091 loss 0.149640\n", + "28 1864 George B. McClellan Democratic 1812807 loss 45.048488\n", + "34 1876 Samuel J. Tilden Democratic 4288546 loss 51.528376\n", + "71 1916 Allan L. Benson Socialist 590524 loss 3.194193" + ], "text/html": [ - "
\n", + "\n", + "
\n", + "
\n", + "
\n", "\n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 66 + } + ], + "source": [ + "elections = pd.read_csv(\"https://raw.githubusercontent.com/archenclan/dsKal100/main/lec/lec04/elections.csv\")\n", + "elections.sample(5)" + ] + }, + { + "cell_type": "code", + "source": [ + "elections.query(\"Candidate == 'Woodrow Wilson'\")" + ], + "metadata": { + "id": "6Vk8X9QEltwH", + "outputId": "a8384009-3cf3-4482-db6c-a71a8accd004", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 111 + } + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Year Candidate Party Popular vote Result %\n", + "70 1912 Woodrow Wilson Democratic 6296284 win 41.933422\n", + "74 1916 Woodrow Wilson Democratic 9126868 win 49.367987" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -11240,35 +7798,98 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
YearCandidatePartyPopular voteResult%
70191241.933422
1131964Barry GoldwaterRepublican27175754loss38.655297741916Woodrow WilsonDemocratic9126868win49.367987
\n", - "
" - ], - "text/plain": [ - " Year Candidate Party Popular vote Result %\n", - "98 1944 Thomas E. Dewey Republican 22017929 loss 46.226199\n", - "34 1876 Samuel J. Tilden Democratic 4288546 loss 51.528376\n", - "42 1888 Alson Streeter Union Labor 146602 loss 1.288861\n", - "70 1912 Woodrow Wilson Democratic 6296284 win 41.933422\n", - "113 1964 Barry Goldwater Republican 27175754 loss 38.655297" + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " ] }, - "execution_count": 64, "metadata": {}, - "output_type": "execute_result" + "execution_count": 68 } - ], - "source": [ - "elections = pd.read_csv(\"elections.csv\")\n", - "elections.sample(5)" ] }, {