From e1daba24dbdbb42200e54721aaec4581e194c9bb Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Sun, 4 Feb 2024 13:35:26 +0000 Subject: [PATCH 01/16] updating feature store using aggregations --- financial_payment_classification_v2.ipynb | 1843 ++++++++++++--------- src/functions/serving.py | 4 +- utils.py | 46 + 3 files changed, 1107 insertions(+), 786 deletions(-) create mode 100644 utils.py diff --git a/financial_payment_classification_v2.ipynb b/financial_payment_classification_v2.ipynb index 92c89e7..391bacb 100644 --- a/financial_payment_classification_v2.ipynb +++ b/financial_payment_classification_v2.ipynb @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", "metadata": {}, "outputs": [], @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", "metadata": { "editable": true, @@ -108,13 +108,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-30 14:08:40,936 [warning] Could not detect path to API server, not connected to API server!\n", - "> 2024-01-30 14:08:40,937 [warning] MLRUN_DBPATH is misconfigured. Set this environment variable to the URL of the API server in order to connect\n", - "> 2024-01-30 14:08:40,962 [info] Identified pre-initialized git repo, using it: {'url': 'git://github.com/amit-elbaz/demo-sagemaker.git#refs/heads/development'}\n", - "> 2024-01-30 14:08:40,968 [warning] Could not detect path to API server, not connected to API server!\n", - "> 2024-01-30 14:08:40,968 [warning] MLRUN_DBPATH is misconfigured. 
Set this environment variable to the URL of the API server in order to connect\n", - "> 2024-01-30 14:08:40,969 [info] Created and saved project: {'name': 'sagemaker2-iguazio', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}\n", - "> 2024-01-30 14:08:40,971 [info] Project created successfully: {'project_name': 'sagemaker2', 'stored_in_db': True}\n" + "Project Source: git://github.com/mlrun/demo-sagemaker#development\n", + "> 2024-02-04 11:28:21,287 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" ] } ], @@ -139,10 +134,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "42c5d6d0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml\n" + ] + } + ], "source": [ "import boto3\n", "import io\n", @@ -159,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", "metadata": {}, "outputs": [], @@ -169,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", "metadata": {}, "outputs": [], @@ -189,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "c0e4db17", "metadata": {}, "outputs": [], @@ -214,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "43946b9f", "metadata": {}, "outputs": [], @@ -254,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "5ff0d280", "metadata": {}, "outputs": [], @@ -277,15 +281,213 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "a477abd7", "metadata": {}, + "outputs": [], + 
"source": [ + "data = pd.read_csv(\n", + " \"financial_transactions_mini.csv\",\n", + " parse_dates=[\"timestamp\"],\n", + " infer_datetime_format=True,\n", + " dtype={\"transaction_category\": \"string\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cf6be447", + "metadata": {}, + "source": [ + "The dataframe looks as follows:\n", + "\n", + "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", + "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", + "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", + "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", + "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", + "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", + "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", + "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", + "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 13:23:44 |\n", + "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", + "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", + "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" + ] + }, + { + "cell_type": "markdown", + "id": "b5492919", + "metadata": {}, + "source": [ + "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", + "metadata": {}, + "outputs": [], + "source": [ + "for key, val in factorize_key.items():\n", + " factorize_key[key] = str(val)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 11, + "id": "0ee06b1d-0cfb-4242-a7e7-2443a0377d99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "[ 'Uncategorized', 'Entertainment', 'Education',\n", + " 'Shopping', 'Personal Care', 'Health and Fitness',\n", + " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", + " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", + " 'Fees and Charges', 'Business Services', 'Personal Services',\n", + " 'Taxes', 'Gambling', 'Home',\n", + " 'Pension and insurances']\n", + "Length: 19, dtype: string" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"transaction_category\"].unique()" + ] + }, + { + "cell_type": "markdown", + "id": "f7314f8a", + "metadata": {}, + "source": [ + "We'll transform the transaction categories to numeric targets for the classification by factorization." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ea2ebdd5", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. 
You can safely remove this argument.\n" + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the 
caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a 
DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" ] }, { @@ -318,44 +520,44 @@ " \n", " \n", " \n", - " 0\n", - " Uncategorized\n", - " 4518551904499919\n", - " 4333582346477646\n", - " 833.26\n", - " 2021-03-10 19:57:42\n", + " 106\n", + " 0\n", + " 4.601853e+15\n", + " 4.274416e+15\n", + " 879.39\n", + " 2021-01-01 
15:07:52\n", " \n", " \n", - " 1\n", - " Uncategorized\n", - " 4518551904499919\n", - " 4642413144038776\n", - " 596.63\n", - " 2021-02-11 17:53:32\n", + " 378\n", + " 0\n", + " 4.274544e+15\n", + " 4.366884e+15\n", + " 628.01\n", + " 2021-01-01 16:33:53\n", " \n", " \n", - " 2\n", - " Uncategorized\n", - " 4274544022939522\n", - " 4952665515556751\n", - " 176.76\n", - " 2021-02-21 18:29:32\n", + " 368\n", + " 0\n", + " 4.601853e+15\n", + " 4.161674e+15\n", + " 89.69\n", + " 2021-01-01 18:17:29\n", " \n", " \n", - " 3\n", - " Uncategorized\n", - " 4518551904499919\n", - " 4457298962882528\n", - " 879.78\n", - " 2021-04-09 16:14:19\n", + " 17\n", + " 0\n", + " 4.518552e+15\n", + " 4.619387e+15\n", + " 222.01\n", + " 2021-01-01 18:33:18\n", " \n", " \n", - " 4\n", - " Uncategorized\n", - " 4601853246125220\n", - " 4578126462896710\n", - " 742.25\n", - " 2021-04-04 15:50:16\n", + " 178\n", + " 0\n", + " 4.274544e+15\n", + " 4.456440e+15\n", + " 418.52\n", + " 2021-01-01 19:33:31\n", " \n", " \n", " ...\n", @@ -366,44 +568,44 @@ " ...\n", " \n", " \n", - " 99992\n", - " Pension and insurances\n", - " 4405008355220324\n", - " 4583355906735225\n", - " 205.43\n", - " 2021-04-20 12:23:53\n", + " 69938\n", + " 9\n", + " 4.904096e+15\n", + " 4.133603e+15\n", + " 124.08\n", + " 2024-02-02 15:00:00\n", " \n", " \n", - " 99993\n", - " Pension and insurances\n", - " 4300416744511335\n", - " 4949240916846171\n", - " 151.49\n", - " 2021-03-24 19:30:18\n", + " 70592\n", + " 9\n", + " 4.904096e+15\n", + " 4.444087e+15\n", + " 188.66\n", + " 2024-02-03 10:00:00\n", " \n", " \n", - " 99994\n", - " Pension and insurances\n", - " 4405008355220324\n", - " 4996896020767264\n", - " 188.28\n", - " 2021-03-08 19:51:10\n", + " 70379\n", + " 9\n", + " 4.200241e+15\n", + " 4.202495e+15\n", + " 139.27\n", + " 2024-02-03 15:00:00\n", " \n", " \n", - " 99995\n", - " Pension and insurances\n", - " 4262047194499006\n", - " 4017367486513464\n", - " 204.26\n", - " 2021-02-14 23:25:07\n", + " 70462\n", 
+ " 9\n", + " 4.612985e+15\n", + " 4.525455e+15\n", + " 12.49\n", + " 2024-02-04 10:00:00\n", " \n", " \n", - " 99996\n", - " Pension and insurances\n", - " 4627516674144704\n", - " 4250420705087194\n", - " 207.92\n", - " 2021-04-14 00:42:00\n", + " 71672\n", + " 9\n", + " 4.538817e+15\n", + " 4.291294e+15\n", + " 57.03\n", + " 2024-02-04 15:00:00\n", " \n", " \n", "\n", @@ -411,279 +613,380 @@ "" ], "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "0 Uncategorized 4518551904499919 4333582346477646 833.26 \n", - "1 Uncategorized 4518551904499919 4642413144038776 596.63 \n", - "2 Uncategorized 4274544022939522 4952665515556751 176.76 \n", - "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "4 Uncategorized 4601853246125220 4578126462896710 742.25 \n", - "... ... ... ... ... \n", - "99992 Pension and insurances 4405008355220324 4583355906735225 205.43 \n", - "99993 Pension and insurances 4300416744511335 4949240916846171 151.49 \n", - "99994 Pension and insurances 4405008355220324 4996896020767264 188.28 \n", - "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "99996 Pension and insurances 4627516674144704 4250420705087194 207.92 \n", + " transaction_category receiver_id sender_id amount \\\n", + "106 0 4.601853e+15 4.274416e+15 879.39 \n", + "378 0 4.274544e+15 4.366884e+15 628.01 \n", + "368 0 4.601853e+15 4.161674e+15 89.69 \n", + "17 0 4.518552e+15 4.619387e+15 222.01 \n", + "178 0 4.274544e+15 4.456440e+15 418.52 \n", + "... ... ... ... ... 
\n", + "69938 9 4.904096e+15 4.133603e+15 124.08 \n", + "70592 9 4.904096e+15 4.444087e+15 188.66 \n", + "70379 9 4.200241e+15 4.202495e+15 139.27 \n", + "70462 9 4.612985e+15 4.525455e+15 12.49 \n", + "71672 9 4.538817e+15 4.291294e+15 57.03 \n", "\n", " timestamp \n", - "0 2021-03-10 19:57:42 \n", - "1 2021-02-11 17:53:32 \n", - "2 2021-02-21 18:29:32 \n", - "3 2021-04-09 16:14:19 \n", - "4 2021-04-04 15:50:16 \n", + "106 2021-01-01 15:07:52 \n", + "378 2021-01-01 16:33:53 \n", + "368 2021-01-01 18:17:29 \n", + "17 2021-01-01 18:33:18 \n", + "178 2021-01-01 19:33:31 \n", "... ... \n", - "99992 2021-04-20 12:23:53 \n", - "99993 2021-03-24 19:30:18 \n", - "99994 2021-03-08 19:51:10 \n", - "99995 2021-02-14 23:25:07 \n", - "99996 2021-04-14 00:42:00 \n", + "69938 2024-02-02 15:00:00 \n", + "70592 2024-02-03 10:00:00 \n", + "70379 2024-02-03 15:00:00 \n", + "70462 2024-02-04 10:00:00 \n", + "71672 2024-02-04 15:00:00 \n", "\n", "[99997 rows x 5 columns]" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = pd.read_csv(\n", - " \"financial_transactions_mini.csv\",\n", - " parse_dates=[\"timestamp\"],\n", - " infer_datetime_format=True,\n", - " dtype={\"transaction_category\": \"string\"},\n", - ")" + "# Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day)\n", + "from utils import update_timestamps\n", + "data = update_timestamps(data)\n", + "data" ] }, { "cell_type": "markdown", - "id": "cf6be447", - "metadata": {}, + "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", + "metadata": { + "tags": [] + }, "source": [ - "The dataframe looks as follows:\n", + "### 3. 
Create feature store \n", "\n", - "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", - "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", - "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", - "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", - "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", - "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", - "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", - "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", - "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 13:23:44 |\n", - "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", - "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", - "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" + "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 
" ] }, { "cell_type": "markdown", - "id": "b5492919", + "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", "metadata": {}, "source": [ - "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" + "#### feature-group-payment-classification" ] }, { "cell_type": "code", - "execution_count": null, - "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", + "execution_count": 21, + "id": "3c621044-681a-4e1a-9968-f637ed992539", "metadata": {}, "outputs": [], "source": [ - "for key, val in factorize_key.items():\n", - " factorize_key[key] = str(val)" + "# Function that gets a dataframe, creates daily dates and group by the date and the given column.\n", + "# It returns a dataframe where the index is daily dates, columns are the categories, and each value is the last average for that category for a given day\n", + "def get_last_transaction_avg_per_day(df, column_to_groupby):\n", + " df['date'] = pd.to_datetime(df['timestamp']).dt.date\n", + " df = df.groupby(['date', df.index])[column_to_groupby].last()\n", + " df = df.unstack(fill_value=0)\n", + " return df\n", + "\n", + "\n", + "# Function that gets a dataframe and calculates a moving average per category and the distance between the row's amount to each category average\n", + "def add_grouped_features(data):\n", + "\n", + " df_with_cat_avg = data\n", + " df_with_cat_avg.sort_values([\"transaction_category\", \"timestamp\"], inplace=True)\n", + " \n", + " # Convert the timestamp to daily date and remove the aggregated average\n", + " df_with_cat_avg['date'] = pd.to_datetime(df_with_cat_avg['timestamp']).dt.date\n", + " df_without_cat_avg = df_with_cat_avg.drop(\"amount_avg_1d\", axis=1)\n", + " \n", + " # Get the daily average per transaction category\n", + " df_with_all_cat_avg = get_last_transaction_avg_per_day(df_with_cat_avg, 'amount_avg_1d')\n", + " \n", + " # Now let's join the 2 dataframes + calculate distance from average\n", + " unique_categories = 
df_without_cat_avg.index.unique()\n", + " df_without_cat_avg = df_without_cat_avg.reset_index()\n", + " \n", + " # Join the 2 dataframes\n", + " df_merged = pd.merge(df_without_cat_avg, df_with_all_cat_avg, on='date', how='outer')\n", + "\n", + " # For each transaction_category, calculate the distance and remove the category column\n", + " for col in unique_categories:\n", + " df_merged[\"dist_\" + col] = abs(df_merged[col] - df_merged[\"amount\"])\n", + " df_merged.drop(col, axis=1, inplace=True)\n", + " \n", + " # Split the timestamp into components\n", + " df_merged[\"year\"] = df_merged[\"timestamp\"].dt.year\n", + " df_merged[\"month\"] = df_merged[\"timestamp\"].dt.month\n", + " df_merged[\"day\"] = df_merged[\"timestamp\"].dt.day\n", + " df_merged[\"hour\"] = df_merged[\"timestamp\"].dt.hour\n", + " df_merged[\"minute\"] = df_merged[\"timestamp\"].dt.minute\n", + " df_merged[\"second\"] = df_merged[\"timestamp\"].dt.second\n", + "\n", + " del df_merged[\"timestamp\"]\n", + " del df_merged[\"date\"] \n", + " df_merged['transaction_id']= df_merged.reset_index().index \n", + " \n", + " return df_merged" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0ee06b1d-0cfb-4242-a7e7-2443a0377d99", + "execution_count": 16, + "id": "07fdb07a-f3b7-4255-b38b-17a939b8676d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "Aggregates\n", + "\n", + "Aggregates\n", + "\n", + "\n", + "\n", + "_start->Aggregates\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fstore\n", + "\n", + "# Create a feature set with the moving average (daily window\n", + "fset = fstore.FeatureSet(\n", + " \"aggregations\",\n", + " 
entities=[mlrun.features.Entity(\"transaction_category\")],\n", + " timestamp_key=\"timestamp\"\n", + ")\n", + "fset.add_aggregation(\"amount\", [\"avg\"], \"1d\")\n", + "fset.set_targets()\n", + "fset.graph.plot()\n", + " \n", + "# Ingest the data (will perform the aggregation)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e213e91e-276a-4cde-a2fc-059369cc837a", + "metadata": {}, + "outputs": [], + "source": [ + "df_with_cat_avg = fset.ingest(data, return_df=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "19081c06-240e-481b-bfe3-588bb77bd54e", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amount_avg_1dreceiver_idsender_idamounttimestamp
transaction_category
0879.3900004.601853e+154.274416e+15879.392021-01-01 15:07:52
0753.7000004.274544e+154.366884e+15628.012021-01-01 16:33:53
0532.3633334.601853e+154.161674e+1589.692021-01-01 18:17:29
0454.7750004.518552e+154.619387e+15222.012021-01-01 18:33:18
0447.5240004.274544e+154.456440e+15418.522021-01-01 19:33:31
..................
9126.3550004.904096e+154.133603e+15124.082024-02-02 15:00:00
9188.6600004.904096e+154.444087e+15188.662024-02-03 10:00:00
9163.9650004.200241e+154.202495e+15139.272024-02-03 15:00:00
912.4900004.612985e+154.525455e+1512.492024-02-04 10:00:00
934.7600004.538817e+154.291294e+1557.032024-02-04 15:00:00
\n", + "

99997 rows × 5 columns

\n", + "
" + ], "text/plain": [ - "\n", - "[ 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances']\n", - "Length: 19, dtype: string" + " amount_avg_1d receiver_id sender_id amount \\\n", + "transaction_category \n", + "0 879.390000 4.601853e+15 4.274416e+15 879.39 \n", + "0 753.700000 4.274544e+15 4.366884e+15 628.01 \n", + "0 532.363333 4.601853e+15 4.161674e+15 89.69 \n", + "0 454.775000 4.518552e+15 4.619387e+15 222.01 \n", + "0 447.524000 4.274544e+15 4.456440e+15 418.52 \n", + "... ... ... ... ... \n", + "9 126.355000 4.904096e+15 4.133603e+15 124.08 \n", + "9 188.660000 4.904096e+15 4.444087e+15 188.66 \n", + "9 163.965000 4.200241e+15 4.202495e+15 139.27 \n", + "9 12.490000 4.612985e+15 4.525455e+15 12.49 \n", + "9 34.760000 4.538817e+15 4.291294e+15 57.03 \n", + "\n", + " timestamp \n", + "transaction_category \n", + "0 2021-01-01 15:07:52 \n", + "0 2021-01-01 16:33:53 \n", + "0 2021-01-01 18:17:29 \n", + "0 2021-01-01 18:33:18 \n", + "0 2021-01-01 19:33:31 \n", + "... ... \n", + "9 2024-02-02 15:00:00 \n", + "9 2024-02-03 10:00:00 \n", + "9 2024-02-03 15:00:00 \n", + "9 2024-02-04 10:00:00 \n", + "9 2024-02-04 15:00:00 \n", + "\n", + "[99997 rows x 5 columns]" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data[\"transaction_category\"].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "f7314f8a", - "metadata": {}, - "source": [ - "We'll transform the transaction categories to numeric targets for the classification by factorization." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea2ebdd5", - "metadata": {}, - "outputs": [], - "source": [ - "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" - ] - }, - { - "cell_type": "markdown", - "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", - "metadata": { - "tags": [] - }, - "source": [ - "### 3. Create feature store \n", - "\n", - "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). " - ] - }, - { - "cell_type": "markdown", - "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", - "metadata": {}, - "source": [ - "#### feature-group-payment-classification" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "3c621044-681a-4e1a-9968-f637ed992539", - "metadata": {}, - "outputs": [], - "source": [ - "# Function that gets a dataframe, creates daily dates and group by the date and the given column.\n", - "# It returns a dataframe where the index is daily dates, columns are the categories, and each value is the last average for that category for a given day\n", - "def get_last_transaction_avg_per_day(df, column_to_groupby):\n", - " df['date'] = pd.to_datetime(df['timestamp']).dt.date\n", - " df = df.groupby(['date', df.index])[column_to_groupby].last()\n", - " df = df.unstack(fill_value=0)\n", - " return df\n", - "\n", - "\n", - "# Creates a feature set with a daily average aggregation on the amount and ingest the given dataframe in it\n", - "def create_aggregations(data):\n", - " data.sort_values([\"transaction_category\", \"timestamp\"], inplace=True)\n", - " \n", - " # Create a feature set with the moving average (daily window\n", - " fset = fstore.FeatureSet(\n", - " \"aggregations\",\n", - " entities=[mlrun.features.Entity(\"transaction_category\")],\n", - " timestamp_key=\"timestamp\"\n", - " )\n", - " fset.add_aggregation(\"amount\", [\"avg\"], \"1d\")\n", - " fset.set_targets()\n", - " fset.graph.plot()\n", - " \n", - " # Ingest the data 
(will perform the aggregation)\n", - " df_with_cat_avg = fset.ingest(data, return_df=True)\n", - " return df_with_cat_avg\n", - "\n", - "# Function that gets a dataframe and calculates a moving average per category and the distance between the row's amount to each category average\n", - "def add_grouped_features(data):\n", - "\n", - " df_with_cat_avg = data\n", - " df_with_cat_avg.sort_values([\"transaction_category\", \"timestamp\"], inplace=True)\n", - " \n", - " # Convert the timestamp to daily date and remove the aggregated average\n", - " df_with_cat_avg['date'] = pd.to_datetime(df_with_cat_avg['timestamp']).dt.date\n", - " df_without_cat_avg = df_with_cat_avg.drop(\"amount_avg_1d\", axis=1)\n", - " \n", - " # Get the daily average per transaction category\n", - " df_with_all_cat_avg = get_last_transaction_avg_per_day(df_with_cat_avg, 'amount_avg_1d')\n", - " \n", - " # Now let's join the 2 dataframes + calculate distance from average\n", - " unique_categories = df_without_cat_avg.index.unique()\n", - " df_without_cat_avg = df_without_cat_avg.reset_index()\n", - " \n", - " # Join the 2 dataframes\n", - " df_merged = pd.merge(df_without_cat_avg, df_with_all_cat_avg, on='date', how='outer')\n", - "\n", - " # For each transaction_category, calculate the distance and remove the category column\n", - " for col in unique_categories:\n", - " df_merged[\"dist_\" + col] = abs(df_merged[col] - df_merged[\"amount\"])\n", - " df_merged.drop(col, axis=1, inplace=True)\n", - " \n", - " # Split the timestamp into components\n", - " df_merged[\"year\"] = df_merged[\"timestamp\"].dt.year\n", - " df_merged[\"month\"] = df_merged[\"timestamp\"].dt.month\n", - " df_merged[\"day\"] = df_merged[\"timestamp\"].dt.day\n", - " df_merged[\"hour\"] = df_merged[\"timestamp\"].dt.hour\n", - " df_merged[\"minute\"] = df_merged[\"timestamp\"].dt.minute\n", - " df_merged[\"second\"] = df_merged[\"timestamp\"].dt.second\n", - "\n", - " del df_merged[\"timestamp\"]\n", - " del 
df_merged[\"date\"] \n", - " \n", - " return df_merged\n", - "\n", - "\n", - "\n", - "\n", - "# Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day)\n", - "def update_timestamps(data):\n", - "\n", - " # Get today's date\n", - " today = datetime.today()\n", - "\n", - " # Calculate the dates for the last 5 days\n", - " last_5_days = [today - timedelta(days=i) for i in range(4, -1, -1)] # Reverse for chronological order\n", - "\n", - " # Extract year, month, and day from each date object\n", - " years = [d.year for d in last_5_days]\n", - " months = [d.month for d in last_5_days]\n", - " days = [d.day for d in last_5_days]\n", - "\n", - " hours = [10, 15]\n", - "\n", - " # Create a list of timestamps of the last 5 days, 2 timestamps per day.\n", - " times = []\n", - " for year, month, day in zip(years, months, days):\n", - " for hour in hours:\n", - " times.append(datetime(year, month, day, hour))\n", - "\n", - "\n", - " # Iterate over each transaction category\n", - " for i in range(len(data[\"transaction_category\"].unique())):\n", - " # Extract all the rows for each category\n", - " category_data = data[data['transaction_category'] == str(i)]\n", - "\n", - " # Ensure timestamp is a datetime object\n", - " category_data['timestamp'] = pd.to_datetime(category_data['timestamp'])\n", - "\n", - " # Sort DataFrame by timestamp in descending order\n", - " category_data_sorted = category_data.sort_values(by='timestamp', ascending=False)\n", - "\n", - " # Select the latest rows and update their timestamp\n", - " latest_rows = category_data_sorted.head(len(times))\n", - " latest_rows['timestamp'] = times\n", - "\n", - " # Update the initial dataframe to include those updated rows\n", - " data.update(latest_rows)\n", - " \n", - " return data" + "df_with_cat_avg" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "c71af4a9-f2d8-40ca-b0bf-3ef67c5b69d9", + "execution_count": 19, + "id": 
"8993721e-f0e5-4438-ab55-2f9bfb78e20a", "metadata": {}, "outputs": [ { @@ -692,73 +995,72 @@ "\n", "\n", - "\n", "\n", - "\n", + "\n", "\n", "mlrun-flow\n", - "\n", + "\n", "\n", "\n", "_start\n", - "\n", - "start\n", + "\n", + "start\n", "\n", "\n", "\n", "add_grouped_features\n", - "\n", - "add_grouped_features\n", + "\n", + "add_grouped_features\n", "\n", "\n", "\n", "_start->add_grouped_features\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "parquet/parquet\n", - "\n", - "\n", - "parquet\n", + "\n", + "\n", + "parquet\n", "\n", "\n", "\n", "add_grouped_features->parquet/parquet\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "nosql/nosql\n", - "\n", - "\n", - "nosql\n", + "\n", + "\n", + "nosql\n", "\n", "\n", "\n", "add_grouped_features->nosql/nosql\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import ParquetTarget\n", "\n", "# creating feature set\n", "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", @@ -769,10 +1071,7 @@ "# setting up the graph\n", "# setting up the graph\n", "extended_transactions_set.graph \\\n", - " .to(name='update_timestamps', handler='update_timestamps').to(name='create_aggregations', handler='create_aggregations').to(name=\"add_grouped_features\", handler=\"add_grouped_features\")\n", - " # Add aggregations for 2, 12, and 24 hour time windows\n", - " \n", - " \n", + " .to(name=\"add_grouped_features\", handler=\"add_grouped_features\")\n", "\n", "\n", "extended_transactions_set.set_targets()\n", @@ -782,7 +1081,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "id": "2085e0a9-56e1-4641-a4a6-64e2124d9c15", "metadata": {}, "outputs": [ @@ -790,7 +1089,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:11:30,483 [warning] 
Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" + "> 2024-02-04 12:59:20,537 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" ] }, { @@ -818,22 +1117,22 @@ " receiver_id\n", " sender_id\n", " amount\n", + " dist_0\n", + " dist_1\n", + " dist_10\n", + " dist_11\n", + " dist_12\n", + " dist_13\n", + " ...\n", + " dist_7\n", + " dist_8\n", + " dist_9\n", " year\n", " month\n", " day\n", " hour\n", " minute\n", " second\n", - " ...\n", - " dist_18\n", - " dist_2\n", - " dist_3\n", - " dist_4\n", - " dist_5\n", - " dist_6\n", - " dist_7\n", - " dist_8\n", - " dist_9\n", " transaction_id\n", " \n", " \n", @@ -841,121 +1140,121 @@ "
\n", " 0\n", " 0\n", - " 4.518552e+15\n", - " 4.333582e+15\n", - " 833.26\n", - " 2021.0\n", - " 3.0\n", - " 10.0\n", - " 19.0\n", - " 57.0\n", - " 42.0\n", + " 4.601853e+15\n", + " 4.274416e+15\n", + " 879.39\n", + " 490.66\n", + " 830.437308\n", + " 776.697953\n", + " 557.02625\n", + " 849.33375\n", + " 680.511538\n", " ...\n", - " 627.802849\n", - " 17.893495\n", - " 732.342497\n", - " 801.755964\n", - " 713.663595\n", - " 740.010607\n", - " 782.187553\n", - " 5191.287484\n", - " 718.480442\n", + " 842.694545\n", + " 6064.542857\n", + " 732.001765\n", + " 2021\n", + " 1\n", + " 1\n", + " 15\n", + " 7\n", + " 52\n", " 0\n", "
\n", "
\n", " 1\n", " 0\n", - " 4.518552e+15\n", - " 4.642413e+15\n", - " 596.63\n", - " 2021.0\n", - " 2.0\n", - " 11.0\n", - " 17.0\n", - " 53.0\n", - " 32.0\n", + " 4.274544e+15\n", + " 4.366884e+15\n", + " 628.01\n", + " 239.28\n", + " 579.057308\n", + " 525.317953\n", + " 305.64625\n", + " 597.95375\n", + " 429.131538\n", " ...\n", - " 391.172849\n", - " 254.523495\n", - " 495.712497\n", - " 565.125964\n", - " 477.033595\n", - " 503.380607\n", - " 545.557553\n", - " 5427.917484\n", - " 481.850442\n", + " 591.314545\n", + " 6315.922857\n", + " 480.621765\n", + " 2021\n", + " 1\n", + " 1\n", + " 16\n", + " 33\n", + " 53\n", " 1\n", "
\n", "
\n", " 2\n", " 0\n", - " 4.274544e+15\n", - " 4.952666e+15\n", - " 176.76\n", - " 2021.0\n", - " 2.0\n", - " 21.0\n", - " 18.0\n", - " 29.0\n", - " 32.0\n", + " 4.601853e+15\n", + " 4.161674e+15\n", + " 89.69\n", + " 299.04\n", + " 40.737308\n", + " 13.002047\n", + " 232.67375\n", + " 59.63375\n", + " 109.188462\n", " ...\n", - " 28.697151\n", - " 674.393495\n", - " 75.842497\n", - " 145.255964\n", - " 57.163595\n", - " 83.510607\n", - " 125.687553\n", - " 5847.787484\n", - " 61.980442\n", + " 52.994545\n", + " 6854.242857\n", + " 57.698235\n", + " 2021\n", + " 1\n", + " 1\n", + " 18\n", + " 17\n", + " 29\n", " 2\n", "
\n", "
\n", " 3\n", " 0\n", " 4.518552e+15\n", - " 4.457299e+15\n", - " 879.78\n", - " 2021.0\n", - " 4.0\n", - " 9.0\n", - " 16.0\n", - " 14.0\n", - " 19.0\n", + " 4.619387e+15\n", + " 222.01\n", + " 166.72\n", + " 173.057308\n", + " 119.317953\n", + " 100.35375\n", + " 191.95375\n", + " 23.131538\n", " ...\n", - " 674.322849\n", - " 28.626505\n", - " 778.862497\n", - " 848.275964\n", - " 760.183595\n", - " 786.530607\n", - " 828.707553\n", - " 5144.767484\n", - " 765.000442\n", + " 185.314545\n", + " 6721.922857\n", + " 74.621765\n", + " 2021\n", + " 1\n", + " 1\n", + " 18\n", + " 33\n", + " 18\n", " 3\n", "
\n", "
\n", " 4\n", " 0\n", - " 4.601853e+15\n", - " 4.578126e+15\n", - " 742.25\n", - " 2021.0\n", - " 4.0\n", - " 4.0\n", - " 15.0\n", - " 50.0\n", - " 16.0\n", + " 4.274544e+15\n", + " 4.456440e+15\n", + " 418.52\n", + " 29.79\n", + " 369.567308\n", + " 315.827953\n", + " 96.15625\n", + " 388.46375\n", + " 219.641538\n", " ...\n", - " 536.792849\n", - " 108.903495\n", - " 641.332497\n", - " 710.745964\n", - " 622.653595\n", - " 649.000607\n", - " 691.177553\n", - " 5282.297484\n", - " 627.470442\n", + " 381.824545\n", + " 6525.412857\n", + " 271.131765\n", + " 2021\n", + " 1\n", + " 1\n", + " 19\n", + " 33\n", + " 31\n", " 4\n", "
\n", "
\n", @@ -984,122 +1283,122 @@ "
\n", "
\n", " 99992\n", - " 18\n", - " 4.405008e+15\n", - " 4.583356e+15\n", - " 205.43\n", - " 2021.0\n", - " 4.0\n", - " 20.0\n", - " 12.0\n", - " 23.0\n", - " 53.0\n", + " 9\n", + " 4.735688e+15\n", + " 4.925043e+15\n", + " 188.77\n", + " 188.77\n", + " 140.164948\n", + " 92.698188\n", + " 153.58500\n", + " 188.77000\n", + " 2.936667\n", " ...\n", - " 0.027151\n", - " 645.723495\n", - " 104.512497\n", - " 173.925964\n", - " 85.833595\n", - " 112.180607\n", - " 154.357553\n", - " 5819.117484\n", - " 90.650442\n", + " 145.000476\n", + " 188.770000\n", + " 51.244583\n", + " 2021\n", + " 4\n", + " 29\n", + " 16\n", + " 6\n", + " 27\n", " 99992\n", "
\n", "
\n", " 99993\n", - " 18\n", - " 4.300417e+15\n", - " 4.949241e+15\n", - " 151.49\n", - " 2021.0\n", - " 3.0\n", - " 24.0\n", - " 19.0\n", - " 30.0\n", - " 18.0\n", + " 9\n", + " 4.419127e+15\n", + " 4.035793e+15\n", + " 130.45\n", + " 130.45\n", + " 81.844948\n", + " 34.378188\n", + " 211.90500\n", + " 130.45000\n", + " 61.256667\n", " ...\n", - " 53.967151\n", - " 699.663495\n", - " 50.572497\n", - " 119.985964\n", - " 31.893595\n", - " 58.240607\n", - " 100.417553\n", - " 5873.057484\n", - " 36.710442\n", + " 86.680476\n", + " 130.450000\n", + " 7.075417\n", + " 2021\n", + " 4\n", + " 29\n", + " 16\n", + " 53\n", + " 31\n", " 99993\n", "
\n", "
\n", " 99994\n", - " 18\n", - " 4.405008e+15\n", - " 4.996896e+15\n", - " 188.28\n", - " 2021.0\n", - " 3.0\n", - " 8.0\n", - " 19.0\n", - " 51.0\n", - " 10.0\n", + " 9\n", + " 4.885580e+15\n", + " 4.892613e+15\n", + " 249.82\n", + " 249.82\n", + " 201.214948\n", + " 153.748188\n", + " 92.53500\n", + " 249.82000\n", + " 58.113333\n", " ...\n", - " 17.177151\n", - " 662.873495\n", - " 87.362497\n", - " 156.775964\n", - " 68.683595\n", - " 95.030607\n", - " 137.207553\n", - " 5836.267484\n", - " 73.500442\n", + " 206.050476\n", + " 249.820000\n", + " 112.294583\n", + " 2021\n", + " 4\n", + " 29\n", + " 16\n", + " 57\n", + " 57\n", " 99994\n", "
\n", "
\n", " 99995\n", - " 18\n", - " 4.262047e+15\n", - " 4.017367e+15\n", - " 204.26\n", - " 2021.0\n", - " 2.0\n", - " 14.0\n", - " 23.0\n", - " 25.0\n", - " 7.0\n", + " 9\n", + " 4.538817e+15\n", + " 4.853749e+15\n", + " 130.17\n", + " 130.17\n", + " 81.564948\n", + " 34.098188\n", + " 212.18500\n", + " 130.17000\n", + " 61.536667\n", " ...\n", - " 1.197151\n", - " 646.893495\n", - " 103.342497\n", - " 172.755964\n", - " 84.663595\n", - " 111.010607\n", - " 153.187553\n", - " 5820.287484\n", - " 89.480442\n", + " 86.400476\n", + " 130.170000\n", + " 7.355417\n", + " 2021\n", + " 4\n", + " 29\n", + " 17\n", + " 4\n", + " 4\n", " 99995\n", "
\n", "
\n", " 99996\n", - " 18\n", - " 4.627517e+15\n", - " 4.250421e+15\n", - " 207.92\n", - " 2021.0\n", - " 4.0\n", - " 14.0\n", - " 0.0\n", - " 42.0\n", - " 0.0\n", + " 9\n", + " 4.871261e+15\n", + " 4.625081e+15\n", + " 135.34\n", + " 135.34\n", + " 86.734948\n", + " 39.268188\n", + " 207.01500\n", + " 135.34000\n", + " 56.366667\n", " ...\n", - " 2.462849\n", - " 643.233495\n", - " 107.002497\n", - " 176.415964\n", - " 88.323595\n", - " 114.670607\n", - " 156.847553\n", - " 5816.627484\n", - " 93.140442\n", + " 91.570476\n", + " 135.340000\n", + " 2.185417\n", + " 2021\n", + " 4\n", + " 29\n", + " 17\n", + " 10\n", + " 21\n", " 99996\n", "
\n", " \n", @@ -1108,69 +1407,69 @@ "" ], "text/plain": [ - " transaction_category receiver_id sender_id amount year month \\\n", - "0 0 4.518552e+15 4.333582e+15 833.26 2021.0 3.0 \n", - "1 0 4.518552e+15 4.642413e+15 596.63 2021.0 2.0 \n", - "2 0 4.274544e+15 4.952666e+15 176.76 2021.0 2.0 \n", - "3 0 4.518552e+15 4.457299e+15 879.78 2021.0 4.0 \n", - "4 0 4.601853e+15 4.578126e+15 742.25 2021.0 4.0 \n", - "... ... ... ... ... ... ... \n", - "99992 18 4.405008e+15 4.583356e+15 205.43 2021.0 4.0 \n", - "99993 18 4.300417e+15 4.949241e+15 151.49 2021.0 3.0 \n", - "99994 18 4.405008e+15 4.996896e+15 188.28 2021.0 3.0 \n", - "99995 18 4.262047e+15 4.017367e+15 204.26 2021.0 2.0 \n", - "99996 18 4.627517e+15 4.250421e+15 207.92 2021.0 4.0 \n", + " transaction_category receiver_id sender_id amount dist_0 \\\n", + "0 0 4.601853e+15 4.274416e+15 879.39 490.66 \n", + "1 0 4.274544e+15 4.366884e+15 628.01 239.28 \n", + "2 0 4.601853e+15 4.161674e+15 89.69 299.04 \n", + "3 0 4.518552e+15 4.619387e+15 222.01 166.72 \n", + "4 0 4.274544e+15 4.456440e+15 418.52 29.79 \n", + "... ... ... ... ... ... \n", + "99992 9 4.735688e+15 4.925043e+15 188.77 188.77 \n", + "99993 9 4.419127e+15 4.035793e+15 130.45 130.45 \n", + "99994 9 4.885580e+15 4.892613e+15 249.82 249.82 \n", + "99995 9 4.538817e+15 4.853749e+15 130.17 130.17 \n", + "99996 9 4.871261e+15 4.625081e+15 135.34 135.34 \n", "\n", - " day hour minute second ... dist_18 dist_2 dist_3 \\\n", - "0 10.0 19.0 57.0 42.0 ... 627.802849 17.893495 732.342497 \n", - "1 11.0 17.0 53.0 32.0 ... 391.172849 254.523495 495.712497 \n", - "2 21.0 18.0 29.0 32.0 ... 28.697151 674.393495 75.842497 \n", - "3 9.0 16.0 14.0 19.0 ... 674.322849 28.626505 778.862497 \n", - "4 4.0 15.0 50.0 16.0 ... 536.792849 108.903495 641.332497 \n", - "... ... ... ... ... ... ... ... ... \n", - "99992 20.0 12.0 23.0 53.0 ... 0.027151 645.723495 104.512497 \n", - "99993 24.0 19.0 30.0 18.0 ... 53.967151 699.663495 50.572497 \n", - "99994 8.0 19.0 51.0 10.0 ... 
17.177151 662.873495 87.362497 \n", - "99995 14.0 23.0 25.0 7.0 ... 1.197151 646.893495 103.342497 \n", - "99996 14.0 0.0 42.0 0.0 ... 2.462849 643.233495 107.002497 \n", + " dist_1 dist_10 dist_11 dist_12 dist_13 ... \\\n", + "0 830.437308 776.697953 557.02625 849.33375 680.511538 ... \n", + "1 579.057308 525.317953 305.64625 597.95375 429.131538 ... \n", + "2 40.737308 13.002047 232.67375 59.63375 109.188462 ... \n", + "3 173.057308 119.317953 100.35375 191.95375 23.131538 ... \n", + "4 369.567308 315.827953 96.15625 388.46375 219.641538 ... \n", + "... ... ... ... ... ... ... \n", + "99992 140.164948 92.698188 153.58500 188.77000 2.936667 ... \n", + "99993 81.844948 34.378188 211.90500 130.45000 61.256667 ... \n", + "99994 201.214948 153.748188 92.53500 249.82000 58.113333 ... \n", + "99995 81.564948 34.098188 212.18500 130.17000 61.536667 ... \n", + "99996 86.734948 39.268188 207.01500 135.34000 56.366667 ... \n", "\n", - " dist_4 dist_5 dist_6 dist_7 dist_8 \\\n", - "0 801.755964 713.663595 740.010607 782.187553 5191.287484 \n", - "1 565.125964 477.033595 503.380607 545.557553 5427.917484 \n", - "2 145.255964 57.163595 83.510607 125.687553 5847.787484 \n", - "3 848.275964 760.183595 786.530607 828.707553 5144.767484 \n", - "4 710.745964 622.653595 649.000607 691.177553 5282.297484 \n", - "... ... ... ... ... ... 
\n", - "99992 173.925964 85.833595 112.180607 154.357553 5819.117484 \n", - "99993 119.985964 31.893595 58.240607 100.417553 5873.057484 \n", - "99994 156.775964 68.683595 95.030607 137.207553 5836.267484 \n", - "99995 172.755964 84.663595 111.010607 153.187553 5820.287484 \n", - "99996 176.415964 88.323595 114.670607 156.847553 5816.627484 \n", + " dist_7 dist_8 dist_9 year month day hour minute \\\n", + "0 842.694545 6064.542857 732.001765 2021 1 1 15 7 \n", + "1 591.314545 6315.922857 480.621765 2021 1 1 16 33 \n", + "2 52.994545 6854.242857 57.698235 2021 1 1 18 17 \n", + "3 185.314545 6721.922857 74.621765 2021 1 1 18 33 \n", + "4 381.824545 6525.412857 271.131765 2021 1 1 19 33 \n", + "... ... ... ... ... ... ... ... ... \n", + "99992 145.000476 188.770000 51.244583 2021 4 29 16 6 \n", + "99993 86.680476 130.450000 7.075417 2021 4 29 16 53 \n", + "99994 206.050476 249.820000 112.294583 2021 4 29 16 57 \n", + "99995 86.400476 130.170000 7.355417 2021 4 29 17 4 \n", + "99996 91.570476 135.340000 2.185417 2021 4 29 17 10 \n", "\n", - " dist_9 transaction_id \n", - "0 718.480442 0 \n", - "1 481.850442 1 \n", - "2 61.980442 2 \n", - "3 765.000442 3 \n", - "4 627.470442 4 \n", - "... ... ... \n", - "99992 90.650442 99992 \n", - "99993 36.710442 99993 \n", - "99994 73.500442 99994 \n", - "99995 89.480442 99995 \n", - "99996 93.140442 99996 \n", + " second transaction_id \n", + "0 52 0 \n", + "1 53 1 \n", + "2 29 2 \n", + "3 18 3 \n", + "4 31 4 \n", + "... ... ... 
\n", + "99992 27 99992 \n", + "99993 31 99993 \n", + "99994 57 99994 \n", + "99995 4 99995 \n", + "99996 21 99996 \n", "\n", "[99997 rows x 30 columns]" ] }, - "execution_count": 17, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import mlrun.feature_store as fstore\n", - "data = extended_transactions_set.ingest(data, overwrite=True)\n", + "data = extended_transactions_set.ingest(df_with_cat_avg, overwrite=True)\n", "data" ] }, @@ -1213,7 +1512,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "id": "bb4bdd8d", "metadata": {}, "outputs": [], @@ -1234,7 +1533,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 24, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -1254,7 +1553,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 25, "id": "e1ca2543", "metadata": {}, "outputs": [], @@ -1280,7 +1579,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 26, "id": "a41b6a7d", "metadata": {}, "outputs": [], @@ -1298,7 +1597,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 27, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -1321,7 +1620,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 28, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -1346,7 +1645,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 29, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -1374,7 +1673,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 30, "id": "c24e06fc", "metadata": { "scrolled": true @@ -1384,143 +1683,143 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-25-14-12-01-149\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-04-12-59-56-016\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-01-25 
14:12:01 Starting - Starting the training job...\n", - "2024-01-25 14:12:18 Starting - Preparing the instances for training.........\n", - "2024-01-25 14:13:58 Downloading - Downloading input data......\n", - "2024-01-25 14:14:34 Downloading - Downloading the training image...\n", - "2024-01-25 14:15:29 Training - Training image download completed. Training in progress...\u001b[34m[2024-01-25 14:15:41.041 ip-10-2-106-129.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-04 12:59:56 Starting - Starting the training job......\n", + "2024-02-04 13:00:31 Starting - Preparing the instances for training......\n", + "2024-02-04 13:01:31 Downloading - Downloading input data...\n", + "2024-02-04 13:02:01 Downloading - Downloading the training image...\n", + "2024-02-04 13:02:51 Training - Training image download completed. 
Training in progress....\u001b[34m[2024-02-04 13:03:08.197 ip-10-0-241-146.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Validation matrix has 20000 rows\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.342 ip-10-2-106-129.ec2.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.00047#011validation-merror:0.00050\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:42.380 ip-10-2-106-129.ec2.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:42.383 ip-10-2-106-129.ec2.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.00023#011validation-merror:0.00040\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[6]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[12]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - 
"\u001b[34m[13]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[29]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[35]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - 
"\u001b[34m[36]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[52]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[58]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - 
"\u001b[34m[59]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[81]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - 
"\u001b[34m[82]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Validation matrix has 20000 rows\u001b[0m\n", + "\u001b[34m[2024-02-04 13:03:08.488 ip-10-0-241-146.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-04 13:03:08.489 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-04 13:03:08.489 ip-10-0-241-146.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-04:13:03:08:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[2024-02-04 13:03:08.490 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-04 13:03:08.490 ip-10-0-241-146.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.54515#011validation-merror:0.55430\u001b[0m\n", + "\u001b[34m[2024-02-04 13:03:10.570 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-04 13:03:10.575 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.53387#011validation-merror:0.54255\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.52198#011validation-merror:0.53050\u001b[0m\n", + "\u001b[34m[3]#011train-merror:0.51036#011validation-merror:0.52010\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.49936#011validation-merror:0.51095\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.49232#011validation-merror:0.50425\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.48936#011validation-merror:0.50210\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.48521#011validation-merror:0.49810\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.48034#011validation-merror:0.49275\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.47621#011validation-merror:0.48995\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.47151#011validation-merror:0.48500\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.46211#011validation-merror:0.47540\u001b[0m\n", + 
"\u001b[34m[12]#011train-merror:0.45770#011validation-merror:0.47160\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.45441#011validation-merror:0.46720\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.45021#011validation-merror:0.46235\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.44288#011validation-merror:0.45495\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.43809#011validation-merror:0.45070\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.43083#011validation-merror:0.44490\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.42683#011validation-merror:0.44065\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.41773#011validation-merror:0.43280\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.41412#011validation-merror:0.42900\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.40940#011validation-merror:0.42570\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.40558#011validation-merror:0.42220\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.40010#011validation-merror:0.41570\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.39509#011validation-merror:0.41130\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.39215#011validation-merror:0.40905\u001b[0m\n", + "\u001b[34m[26]#011train-merror:0.38077#011validation-merror:0.39840\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.37355#011validation-merror:0.39080\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.36949#011validation-merror:0.38705\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.36450#011validation-merror:0.38150\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.35094#011validation-merror:0.36650\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.34519#011validation-merror:0.35935\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.34140#011validation-merror:0.35690\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.33711#011validation-merror:0.35250\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.33434#011validation-merror:0.34945\u001b[0m\n", + 
"\u001b[34m[35]#011train-merror:0.32674#011validation-merror:0.34200\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.32153#011validation-merror:0.33760\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.31661#011validation-merror:0.33145\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.31099#011validation-merror:0.32515\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.30624#011validation-merror:0.31980\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.29983#011validation-merror:0.31405\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.29713#011validation-merror:0.31095\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.29291#011validation-merror:0.30635\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.28383#011validation-merror:0.29810\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.27650#011validation-merror:0.29000\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.26714#011validation-merror:0.27965\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.26255#011validation-merror:0.27560\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.25953#011validation-merror:0.27225\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.25214#011validation-merror:0.26700\u001b[0m\n", + "\u001b[34m[49]#011train-merror:0.24840#011validation-merror:0.26255\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.24452#011validation-merror:0.25870\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.24284#011validation-merror:0.25745\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.23650#011validation-merror:0.25165\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.23110#011validation-merror:0.24585\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.22725#011validation-merror:0.24250\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.22378#011validation-merror:0.23920\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.22085#011validation-merror:0.23610\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.21870#011validation-merror:0.23400\u001b[0m\n", + 
"\u001b[34m[58]#011train-merror:0.21478#011validation-merror:0.23010\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.21048#011validation-merror:0.22455\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.20771#011validation-merror:0.22170\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.20252#011validation-merror:0.21665\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.19939#011validation-merror:0.21390\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.19668#011validation-merror:0.21095\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.19499#011validation-merror:0.20855\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.18875#011validation-merror:0.20205\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.18505#011validation-merror:0.19835\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.18059#011validation-merror:0.19375\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.17695#011validation-merror:0.19005\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.17485#011validation-merror:0.18875\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.17348#011validation-merror:0.18735\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.17201#011validation-merror:0.18580\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.16625#011validation-merror:0.18010\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.16215#011validation-merror:0.17615\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.15796#011validation-merror:0.17250\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.15368#011validation-merror:0.16840\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.15146#011validation-merror:0.16645\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.15016#011validation-merror:0.16520\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.14722#011validation-merror:0.16150\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.14631#011validation-merror:0.16020\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.14452#011validation-merror:0.15885\u001b[0m\n", + 
"\u001b[34m[81]#011train-merror:0.14262#011validation-merror:0.15690\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.14009#011validation-merror:0.15445\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.13919#011validation-merror:0.15385\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.13535#011validation-merror:0.14980\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.13311#011validation-merror:0.14770\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.13081#011validation-merror:0.14585\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.12935#011validation-merror:0.14420\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.12825#011validation-merror:0.14325\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.12615#011validation-merror:0.14100\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.12518#011validation-merror:0.13960\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.12415#011validation-merror:0.13890\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.12099#011validation-merror:0.13485\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.11899#011validation-merror:0.13305\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.11806#011validation-merror:0.13175\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.11681#011validation-merror:0.13050\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.11598#011validation-merror:0.12980\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.11502#011validation-merror:0.12865\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.11488#011validation-merror:0.12835\u001b[0m\n", "\n", - "2024-01-25 14:17:00 Uploading - Uploading generated training model\u001b[34m[90]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - 
"\u001b[34m[94]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "2024-02-04 13:06:22 Uploading - Uploading generated training model\u001b[34m[99]#011train-merror:0.11256#011validation-merror:0.12620\u001b[0m\n", "\n", - "2024-01-25 14:17:16 Completed - Training job completed\n", - "Training seconds: 198\n", - "Billable seconds: 198\n" + "2024-02-04 13:06:38 Completed - Training job completed\n", + "Training seconds: 308\n", + "Billable seconds: 308\n" ] } ], @@ -1540,7 +1839,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 35, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": [], @@ -1550,64 +1849,21 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 36, "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", "metadata": {}, "outputs": [ { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "xgboost-model\n", - "\n", - "xgboost-model\n", - "\n", - "\n", - "\n", - "_start->xgboost-model\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "postprocess\n", - "\n", - "postprocess\n", - "\n", - "\n", - "\n", - "xgboost-model->postprocess\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" + "ename": "MLRunInvalidArgumentError", + "evalue": "graph topology is already set, cannot be overwritten", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mMLRunInvalidArgumentError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[36], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Set the topology and get the graph object:\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m graph \u001b[38;5;241m=\u001b[39m \u001b[43mserving_function\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mset_topology\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mflow\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43masync\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Add the steps:\u001b[39;00m\n\u001b[1;32m 5\u001b[0m graph\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mXGBModelServer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxgboost-model\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m model_path\u001b[38;5;241m=\u001b[39mxgb\u001b[38;5;241m.\u001b[39mmodel_data) \\\n\u001b[1;32m 8\u001b[0m \u001b[38;5;241m.\u001b[39mto(handler\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpostprocess\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpostprocess\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mrespond()\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:282\u001b[0m, in \u001b[0;36mServingRuntime.set_topology\u001b[0;34m(self, topology, class_name, engine, exist_ok, **class_args)\u001b[0m\n\u001b[1;32m 280\u001b[0m topology \u001b[38;5;241m=\u001b[39m topology 
\u001b[38;5;129;01mor\u001b[39;00m StepKinds\u001b[38;5;241m.\u001b[39mrouter\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mspec\u001b[38;5;241m.\u001b[39mgraph \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m exist_ok:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mlrun\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mMLRunInvalidArgumentError(\n\u001b[1;32m 283\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph topology is already set, cannot be overwritten\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 284\u001b[0m )\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m topology \u001b[38;5;241m==\u001b[39m StepKinds\u001b[38;5;241m.\u001b[39mrouter:\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m class_name \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(class_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_dict\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n", + "\u001b[0;31mMLRunInvalidArgumentError\u001b[0m: graph topology is already set, cannot be overwritten" + ] } ], "source": [ @@ -1626,7 +1882,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 33, "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", "metadata": {}, "outputs": [ @@ -1634,25 +1890,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:17:46,696 [info] Starting remote function deploy\n", - "2024-01-25 14:17:46 (info) Deploying function\n", - "2024-01-25 14:17:46 (info) Building\n", - "2024-01-25 14:17:47 (info) Staging files and preparing base images\n", - "2024-01-25 14:17:47 (info) Building processor image\n", - "2024-01-25 14:19:32 (info) Build complete\n", - "2024-01-25 14:19:40 (info) Function deploy complete\n", - "> 2024-01-25 14:19:48,105 [info] successfully deployed function: {'internal_invocation_urls': 
['nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/']}\n" + "> 2024-02-04 13:07:10,636 [info] Starting remote function deploy\n", + "2024-02-04 13:07:11 (info) Deploying function\n", + "2024-02-04 13:07:11 (info) Building\n", + "2024-02-04 13:07:11 (info) Staging files and preparing base images\n", + "2024-02-04 13:07:11 (info) Building processor image\n", + "2024-02-04 13:08:56 (info) Build complete\n", + "Failed to deploy. Details:\n", + "Caught unhandled exception while initializing [worker_id=\"0\" || err=\"module 'pandas' has no attribute 'Dataframe'\" || traceback=\"Traceback (most recent call last):\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 447, in run_wrapper\n", + " wrapper_instance = Wrapper(root_logger,\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 82, in __init__\n", + " self._entrypoint = self._load_entrypoint_from_handler(handler)\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 234, in _load_entrypoint_from_handler\n", + " module = __import__(module_name)\n", + " File \"/opt/nuclio/serving.py\", line 56, in \n", + " def preprocess(data: pd.Dataframe):\n", + "AttributeError: module 'pandas' has no attribute 'Dataframe'\n", + "\"]\n", + "> 2024-02-04 13:09:22,537 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" ] }, { - "data": { - "text/plain": [ - "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/', 'name': 'sagemaker-yoni-serving'})" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" + "ename": "RunError", + "evalue": "Function serving deployment failed", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRunError\u001b[0m 
Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/project.py:3188\u001b[0m, in \u001b[0;36mMlrunProject.deploy_function\u001b[0;34m(self, function, dashboard, models, env, tag, verbose, builder_env, mock)\u001b[0m\n\u001b[1;32m 3166\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeploy_function\u001b[39m(\n\u001b[1;32m 3167\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 3168\u001b[0m function: typing\u001b[38;5;241m.\u001b[39mUnion[\u001b[38;5;28mstr\u001b[39m, mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mBaseRuntime],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3175\u001b[0m mock: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 3176\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mUnion[DeployStatus, kfp\u001b[38;5;241m.\u001b[39mdsl\u001b[38;5;241m.\u001b[39mContainerOp]:\n\u001b[1;32m 3177\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"deploy real-time (nuclio based) functions\u001b[39;00m\n\u001b[1;32m 3178\u001b[0m \n\u001b[1;32m 3179\u001b[0m \u001b[38;5;124;03m :param function: name of the function (in the project) or function object\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3186\u001b[0m \u001b[38;5;124;03m :param mock: deploy mock server vs a real Nuclio function (for local simulations)\u001b[39;00m\n\u001b[1;32m 3187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3189\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3190\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3192\u001b[0m \u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3194\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3196\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmock\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmock\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/operations.py:395\u001b[0m, in \u001b[0;36mdeploy_function\u001b[0;34m(function, dashboard, models, env, tag, verbose, builder_env, project_object, mock)\u001b[0m\n\u001b[1;32m 388\u001b[0m function\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 390\u001b[0m 
state\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 391\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname},\n\u001b[1;32m 392\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 393\u001b[0m )\n\u001b[0;32m--> 395\u001b[0m address \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# return object with the same outputs as the KFP op (allow using the same pipeline)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 400\u001b[0m state\u001b[38;5;241m=\u001b[39mfunction\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 401\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: address, 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mnuclio_name},\n\u001b[1;32m 402\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 403\u001b[0m )\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:647\u001b[0m, in \u001b[0;36mServingRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_deploy_function_refs()\n\u001b[1;32m 645\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeploy root function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m ...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 652\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 654\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mforce_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 655\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:586\u001b[0m, in \u001b[0;36mRemoteRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_credentials_from_remote_build(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 584\u001b[0m \u001b[38;5;66;03m# when a function is deployed, we wait for it to be ready by default\u001b[39;00m\n\u001b[1;32m 585\u001b[0m \u001b[38;5;66;03m# this also means that the function object will be updated with the function status\u001b[39;00m\n\u001b[0;32m--> 586\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_function_deployment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[38;5;66;03m# NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort\u001b[39;00m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# now, functions can be not exposed (using service type ClusterIP) and hence\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;66;03m# for BC we first try to populate the external invocation url, and then\u001b[39;00m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;66;03m# if not exists, take the internal invocation url\u001b[39;00m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mexternal_invocation_urls:\n", + "File 
\u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:633\u001b[0m, in \u001b[0;36mRemoteRuntime._wait_for_function_deployment\u001b[0;34m(self, db, verbose)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m state \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 632\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNuclio function failed to deploy\u001b[39m\u001b[38;5;124m\"\u001b[39m, function_state\u001b[38;5;241m=\u001b[39mstate)\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RunError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFunction \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m deployment failed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mRunError\u001b[0m: Function serving deployment failed" + ] } ], "source": [ @@ -1661,7 +1934,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "c858e3e9-9e43-4148-8015-6047565db456", "metadata": {}, "outputs": [], @@ -2347,9 +2620,9 @@ "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "mlrun-base", + "display_name": "smdemo", "language": "python", - "name": "conda-env-mlrun-base-py" + "name": "smdemo" }, "language_info": { "codemirror_mode": { @@ -2361,7 +2634,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/src/functions/serving.py b/src/functions/serving.py index 7262df9..50de02a 100644 --- a/src/functions/serving.py +++ b/src/functions/serving.py @@ -8,6 +8,8 @@ import xgboost as xgb from cloudpickle import load +import mlrun.feature_store as fstore + 
warnings.filterwarnings("ignore") @@ -53,7 +55,7 @@ def _set_model_path(self): self.model_path = model_path # Function that preprocesses the inference data -def preprocess(data: pd.Dataframe): +def preprocess(self, data: pd.Dataframe): unique_categories = data.transaction_category.unique() # Create a feature vector that gets the average amount vector = fstore.FeatureVector("transactions_vector", ["aggregations.amount_avg_1d"], with_indexes=True) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..7455507 --- /dev/null +++ b/utils.py @@ -0,0 +1,46 @@ +import pandas as pd +from datetime import datetime, timedelta + +# Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day) +def update_timestamps(data): + # Get today's date + today = datetime.today() + + # Calculate the dates for the last 5 days + last_5_days = [today - timedelta(days=i) for i in range(4, -1, -1)] # Reverse for chronological order + + # Extract year, month, and day from each date object + years = [d.year for d in last_5_days] + months = [d.month for d in last_5_days] + days = [d.day for d in last_5_days] + + hours = [10, 15] + + # Create a list of timestamps of the last 5 days, 2 timestamps per day. 
+ times = [] + for year, month, day in zip(years, months, days): + for hour in hours: + times.append(datetime(year, month, day, hour)) + + # Iterate over each transaction category + for i in range(len(data["transaction_category"].unique())): + # Extract all the rows for each category + category_data = data[data['transaction_category'] == str(i)] + + # Ensure timestamp is a datetime object + pd.to_datetime(category_data.timestamp) + + # Sort DataFrame by timestamp in descending order + category_data_sorted = category_data.sort_values(by='timestamp', ascending=False) + + # Select the latest rows and update their timestamp + latest_rows = category_data_sorted.head(len(times)) + latest_rows.loc[:, 'timestamp'] = times + + # Update the initial dataframe to include those updated rows + data.update(latest_rows) + + data.sort_values(["transaction_category", "timestamp"], inplace=True) + + + return data \ No newline at end of file From 1f5547fb9da2872dbb2ac4650a2b3846bef09d75 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 5 Feb 2024 16:41:13 +0000 Subject: [PATCH 02/16] updating serving function --- financial_payment_classification_v2.ipynb | 613 +++++++++------------- src/functions/serving.py | 73 +-- 2 files changed, 288 insertions(+), 398 deletions(-) diff --git a/financial_payment_classification_v2.ipynb b/financial_payment_classification_v2.ipynb index 391bacb..64b39f1 100644 --- a/financial_payment_classification_v2.ipynb +++ b/financial_payment_classification_v2.ipynb @@ -76,15 +76,15 @@ }, "outputs": [], "source": [ - "import sys\n", - "!{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", - "!{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", - "!{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" + "# import sys\n", + "# !{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", + "# 
!{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", + "# !{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", "metadata": {}, "outputs": [], @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", "metadata": { "editable": true, @@ -109,7 +109,7 @@ "output_type": "stream", "text": [ "Project Source: git://github.com/mlrun/demo-sagemaker#development\n", - "> 2024-02-04 11:28:21,287 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" + "> 2024-02-04 15:29:54,524 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" ] } ], @@ -134,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "42c5d6d0", "metadata": {}, "outputs": [ @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", "metadata": {}, "outputs": [], @@ -173,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", "metadata": {}, "outputs": [], @@ -193,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "c0e4db17", "metadata": {}, "outputs": [], @@ -218,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "43946b9f", "metadata": {}, "outputs": [], @@ -258,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "5ff0d280", "metadata": {}, "outputs": [], @@ -281,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "a477abd7", "metadata": {}, "outputs": [], @@ -325,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": 10, + 
"execution_count": 11, "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", "metadata": {}, "outputs": [], @@ -336,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "0ee06b1d-0cfb-4242-a7e7-2443a0377d99", "metadata": {}, "outputs": [ @@ -354,7 +354,7 @@ "Length: 19, dtype: string" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -373,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "ea2ebdd5", "metadata": {}, "outputs": [], @@ -383,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", "metadata": { "scrolled": true @@ -642,7 +642,7 @@ "[99997 rows x 5 columns]" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -676,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 15, "id": "3c621044-681a-4e1a-9968-f637ed992539", "metadata": {}, "outputs": [], @@ -772,7 +772,7 @@ "
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 16, @@ -983,6 +983,14 @@ "df_with_cat_avg" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "510e4ca6-51bd-4431-837e-a634ed1a38ba", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 19, @@ -1051,7 +1059,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 19, @@ -1081,7 +1089,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "2085e0a9-56e1-4641-a4a6-64e2124d9c15", "metadata": {}, "outputs": [ @@ -1089,7 +1097,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-04 12:59:20,537 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" + "> 2024-02-04 15:43:38,187 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" ] }, { @@ -1462,7 +1470,7 @@ "[99997 rows x 30 columns]" ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1512,7 +1520,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "bb4bdd8d", "metadata": {}, "outputs": [], @@ -1533,7 +1541,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -1553,7 +1561,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "id": "e1ca2543", "metadata": {}, "outputs": [], @@ -1579,7 +1587,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "id": "a41b6a7d", "metadata": {}, "outputs": [], @@ -1597,7 +1605,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -1620,7 +1628,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -1645,7 
+1653,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 27, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -1673,7 +1681,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 28, "id": "c24e06fc", "metadata": { "scrolled": true @@ -1683,39 +1691,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-04-12-59-56-016\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-04-15-43-54-687\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-02-04 12:59:56 Starting - Starting the training job......\n", - "2024-02-04 13:00:31 Starting - Preparing the instances for training......\n", - "2024-02-04 13:01:31 Downloading - Downloading input data...\n", - "2024-02-04 13:02:01 Downloading - Downloading the training image...\n", - "2024-02-04 13:02:51 Training - Training image download completed. Training in progress....\u001b[34m[2024-02-04 13:03:08.197 ip-10-0-241-146.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-04 15:43:54 Starting - Starting the training job......\n", + "2024-02-04 15:44:40 Starting - Preparing the instances for training......\n", + "2024-02-04 15:45:37 Downloading - Downloading input data...\n", + "2024-02-04 15:46:07 Downloading - Downloading the training image......\n", + "2024-02-04 15:47:02 Training - Training image download completed. 
Training in progress..\u001b[34m[2024-02-04 15:47:19.110 ip-10-0-250-195.us-east-2.compute.internal:6 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Validation matrix has 20000 rows\u001b[0m\n", - "\u001b[34m[2024-02-04 13:03:08.488 ip-10-0-241-146.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-04 13:03:08.489 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-04 13:03:08.489 ip-10-0-241-146.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-04:13:03:08:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[2024-02-04 13:03:08.490 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-04 13:03:08.490 ip-10-0-241-146.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Validation matrix has 20000 rows\u001b[0m\n", + "\u001b[34m[2024-02-04 15:47:19.401 ip-10-0-250-195.us-east-2.compute.internal:6 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-04 15:47:19.402 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-04 15:47:19.402 ip-10-0-250-195.us-east-2.compute.internal:6 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-04 15:47:19.403 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-04 15:47:19.403 ip-10-0-250-195.us-east-2.compute.internal:6 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-04:15:47:19:INFO] Debug hook created from config\u001b[0m\n", "\u001b[34m[0]#011train-merror:0.54515#011validation-merror:0.55430\u001b[0m\n", - "\u001b[34m[2024-02-04 13:03:10.570 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-04 13:03:10.575 ip-10-0-241-146.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[2024-02-04 15:47:21.396 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-04 15:47:21.399 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:486] Hook is writing from the hook with pid: 6\u001b[0m\n", "\u001b[34m[1]#011train-merror:0.53387#011validation-merror:0.54255\u001b[0m\n", "\u001b[34m[2]#011train-merror:0.52198#011validation-merror:0.53050\u001b[0m\n", "\u001b[34m[3]#011train-merror:0.51036#011validation-merror:0.52010\u001b[0m\n", @@ -1811,15 +1819,15 @@ "\u001b[34m[93]#011train-merror:0.11899#011validation-merror:0.13305\u001b[0m\n", "\u001b[34m[94]#011train-merror:0.11806#011validation-merror:0.13175\u001b[0m\n", "\u001b[34m[95]#011train-merror:0.11681#011validation-merror:0.13050\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.11598#011validation-merror:0.12980\u001b[0m\n", + "\n", + "2024-02-04 15:50:23 Uploading - 
Uploading generated training model\u001b[34m[96]#011train-merror:0.11598#011validation-merror:0.12980\u001b[0m\n", "\u001b[34m[97]#011train-merror:0.11502#011validation-merror:0.12865\u001b[0m\n", "\u001b[34m[98]#011train-merror:0.11488#011validation-merror:0.12835\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.11256#011validation-merror:0.12620\u001b[0m\n", "\n", - "2024-02-04 13:06:22 Uploading - Uploading generated training model\u001b[34m[99]#011train-merror:0.11256#011validation-merror:0.12620\u001b[0m\n", - "\n", - "2024-02-04 13:06:38 Completed - Training job completed\n", - "Training seconds: 308\n", - "Billable seconds: 308\n" + "2024-02-04 15:50:39 Completed - Training job completed\n", + "Training seconds: 303\n", + "Billable seconds: 303\n" ] } ], @@ -1839,7 +1847,28 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 29, + "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xgb.model_data" + ] + }, + { + "cell_type": "code", + "execution_count": 30, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": [], @@ -1849,21 +1878,76 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 31, "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", "metadata": {}, "outputs": [ { - "ename": "MLRunInvalidArgumentError", - "evalue": "graph topology is already set, cannot be overwritten", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mMLRunInvalidArgumentError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[36], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Set 
the topology and get the graph object:\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m graph \u001b[38;5;241m=\u001b[39m \u001b[43mserving_function\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mset_topology\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mflow\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43masync\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Add the steps:\u001b[39;00m\n\u001b[1;32m 5\u001b[0m graph\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mXGBModelServer\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxgboost-model\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m model_path\u001b[38;5;241m=\u001b[39mxgb\u001b[38;5;241m.\u001b[39mmodel_data) \\\n\u001b[1;32m 8\u001b[0m \u001b[38;5;241m.\u001b[39mto(handler\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpostprocess\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpostprocess\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mrespond()\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:282\u001b[0m, in \u001b[0;36mServingRuntime.set_topology\u001b[0;34m(self, topology, class_name, engine, exist_ok, **class_args)\u001b[0m\n\u001b[1;32m 280\u001b[0m topology \u001b[38;5;241m=\u001b[39m topology \u001b[38;5;129;01mor\u001b[39;00m StepKinds\u001b[38;5;241m.\u001b[39mrouter\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mspec\u001b[38;5;241m.\u001b[39mgraph \u001b[38;5;129;01mand\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m exist_ok:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mlrun\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mMLRunInvalidArgumentError(\n\u001b[1;32m 283\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgraph topology is already set, cannot be overwritten\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 284\u001b[0m )\n\u001b[1;32m 286\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m topology \u001b[38;5;241m==\u001b[39m StepKinds\u001b[38;5;241m.\u001b[39mrouter:\n\u001b[1;32m 287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m class_name \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(class_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_dict\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n", - "\u001b[0;31mMLRunInvalidArgumentError\u001b[0m: graph topology is already set, cannot be overwritten" - ] + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "preprocess\n", + "\n", + "preprocess\n", + "\n", + "\n", + "\n", + "_start->preprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "xgboost-model\n", + "\n", + "xgboost-model\n", + "\n", + "\n", + "\n", + "preprocess->xgboost-model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "postprocess\n", + "\n", + "postprocess\n", + "\n", + "\n", + "\n", + "xgboost-model->postprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1871,7 +1955,8 @@ "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", "\n", "# Add the steps:\n", - "graph.to(\"XGBModelServer\",\n", + "graph.to(handler=\"preprocess\", name=\"preprocess\") \\\n", + " .to(\"XGBModelServer\",\n", " name=\"xgboost-model\",\n", " model_path=xgb.model_data) \\\n", " .to(handler=\"postprocess\", 
name=\"postprocess\").respond()\n", @@ -1882,7 +1967,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", "metadata": {}, "outputs": [ @@ -1890,42 +1975,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-04 13:07:10,636 [info] Starting remote function deploy\n", - "2024-02-04 13:07:11 (info) Deploying function\n", - "2024-02-04 13:07:11 (info) Building\n", - "2024-02-04 13:07:11 (info) Staging files and preparing base images\n", - "2024-02-04 13:07:11 (info) Building processor image\n", - "2024-02-04 13:08:56 (info) Build complete\n", - "Failed to deploy. Details:\n", - "Caught unhandled exception while initializing [worker_id=\"0\" || err=\"module 'pandas' has no attribute 'Dataframe'\" || traceback=\"Traceback (most recent call last):\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 447, in run_wrapper\n", - " wrapper_instance = Wrapper(root_logger,\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 82, in __init__\n", - " self._entrypoint = self._load_entrypoint_from_handler(handler)\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 234, in _load_entrypoint_from_handler\n", - " module = __import__(module_name)\n", - " File \"/opt/nuclio/serving.py\", line 56, in \n", - " def preprocess(data: pd.Dataframe):\n", - "AttributeError: module 'pandas' has no attribute 'Dataframe'\n", - "\"]\n", - "> 2024-02-04 13:09:22,537 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" + "> 2024-02-04 15:51:09,295 [info] Starting remote function deploy\n", + "2024-02-04 15:51:09 (info) Deploying function\n", + "2024-02-04 15:51:09 (info) Building\n", + "2024-02-04 15:51:10 (info) Staging files and preparing base images\n", + "2024-02-04 15:51:10 (info) Building processor image\n", + "2024-02-04 15:52:46 (info) Build complete\n", + "2024-02-04 15:52:46 (info) Function deploy complete\n", + "> 2024-02-04 15:52:51,661 [info] successfully deployed function: 
{'internal_invocation_urls': ['nuclio-sagemaker-admin-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" ] }, { - "ename": "RunError", - "evalue": "Function serving deployment failed", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[33], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/project.py:3188\u001b[0m, in \u001b[0;36mMlrunProject.deploy_function\u001b[0;34m(self, function, dashboard, models, env, tag, verbose, builder_env, mock)\u001b[0m\n\u001b[1;32m 3166\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeploy_function\u001b[39m(\n\u001b[1;32m 3167\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 3168\u001b[0m function: typing\u001b[38;5;241m.\u001b[39mUnion[\u001b[38;5;28mstr\u001b[39m, mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mBaseRuntime],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3175\u001b[0m mock: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 3176\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mUnion[DeployStatus, kfp\u001b[38;5;241m.\u001b[39mdsl\u001b[38;5;241m.\u001b[39mContainerOp]:\n\u001b[1;32m 3177\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"deploy real-time (nuclio based) functions\u001b[39;00m\n\u001b[1;32m 3178\u001b[0m \n\u001b[1;32m 
3179\u001b[0m \u001b[38;5;124;03m :param function: name of the function (in the project) or function object\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3186\u001b[0m \u001b[38;5;124;03m :param mock: deploy mock server vs a real Nuclio function (for local simulations)\u001b[39;00m\n\u001b[1;32m 3187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3189\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3190\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3192\u001b[0m \u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3194\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3196\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmock\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmock\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/operations.py:395\u001b[0m, in \u001b[0;36mdeploy_function\u001b[0;34m(function, dashboard, models, env, tag, verbose, builder_env, project_object, mock)\u001b[0m\n\u001b[1;32m 388\u001b[0m function\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 390\u001b[0m state\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 391\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname},\n\u001b[1;32m 392\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 393\u001b[0m )\n\u001b[0;32m--> 395\u001b[0m address \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# return object with the same outputs as the KFP op (allow using the same pipeline)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 400\u001b[0m state\u001b[38;5;241m=\u001b[39mfunction\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 401\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: address, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mnuclio_name},\n\u001b[1;32m 402\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 403\u001b[0m )\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:647\u001b[0m, in \u001b[0;36mServingRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_deploy_function_refs()\n\u001b[1;32m 645\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeploy root function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m ...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
652\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 654\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 655\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:586\u001b[0m, in \u001b[0;36mRemoteRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_credentials_from_remote_build(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 584\u001b[0m \u001b[38;5;66;03m# when a function is deployed, we wait for it to be ready by default\u001b[39;00m\n\u001b[1;32m 585\u001b[0m \u001b[38;5;66;03m# this also means that the function object will be updated with the function status\u001b[39;00m\n\u001b[0;32m--> 586\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_function_deployment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[38;5;66;03m# NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort\u001b[39;00m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# now, functions can be not exposed (using service type ClusterIP) and hence\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;66;03m# for BC we first try to populate the external invocation url, and then\u001b[39;00m\n\u001b[1;32m 591\u001b[0m 
\u001b[38;5;66;03m# if not exists, take the internal invocation url\u001b[39;00m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mexternal_invocation_urls:\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:633\u001b[0m, in \u001b[0;36mRemoteRuntime._wait_for_function_deployment\u001b[0;34m(self, db, verbose)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m state \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 632\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNuclio function failed to deploy\u001b[39m\u001b[38;5;124m\"\u001b[39m, function_state\u001b[38;5;241m=\u001b[39mstate)\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RunError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFunction \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m deployment failed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mRunError\u001b[0m: Function serving deployment failed" - ] + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-admin-serving'})" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1934,7 +2002,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "c858e3e9-9e43-4148-8015-6047565db456", "metadata": {}, "outputs": [], @@ -1944,7 +2012,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 34, "id": 
"de741da6-8ff6-4f60-bae4-3c1d11df87c4", "metadata": {}, "outputs": [ @@ -1952,7 +2020,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:19:48,167 [info] invoking function: {'method': 'POST', 'path': 'http://nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080/predict'}\n" + "> 2024-02-04 15:52:51,734 [info] invoking function: {'method': 'POST', 'path': 'http://sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//predict'}\n" ] } ], @@ -1972,7 +2040,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 35, "id": "2e863ea7-5804-4637-b677-390c305cabfe", "metadata": {}, "outputs": [], @@ -1990,7 +2058,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 36, "id": "ca4f7e49", "metadata": {}, "outputs": [], @@ -2008,7 +2076,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 37, "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", "metadata": {}, "outputs": [ @@ -2016,19 +2084,39 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:19:48,410 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': 'cac9cd3c55ba40d58fbe1156d4861e79', 'db': 'http://mlrun-api:8080'}\n", - "> 2024-01-25 14:19:48,708 [info] Job is running in the background, pod: evaluate-evaluate-5rrtk\n", - "[14:19:52] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", - "configuration generated by an older version of XGBoost, please export the model by calling\n", - "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", - "\n", - " https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html\n", + "> 2024-02-04 15:52:51,982 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '6c838c8bbb234d7eb642d9401eff7068', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", + "> 2024-02-04 15:52:52,382 [info] Job is running in the background, pod: evaluate-evaluate-hdk2c\n", + "> 2024-02-04 15:52:56,798 [error] Execution error, Traceback (most recent call last):\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/runtimes/local.py\", line 475, in exec_from_params\n", + " val = mlrun.handler(\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/package/__init__.py\", line 140, in wrapper\n", + " func_outputs = func(*args, **kwargs)\n", + " File \"evaluate.py\", line 44, in evaluate\n", + " model_temp_path = _download_object_from_s3(model_path, suffix=\".tar.gz\")\n", + " File \"evaluate.py\", line 88, in _download_object_from_s3\n", + " obj.download(temp_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", + " self._store.download(self._path, target_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", + " data = self.get(key)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", + " return obj.get()[\"Body\"].read()\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", + " response = action(self, *args, **kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", + " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", + " return self._make_api_call(operation_name, kwargs)\n", + " File 
\"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", + " raise error_class(parsed_response, operation_name)\n", + "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", "\n", - "for more details about differences between saving model and serializing.\n", - "\n", - "> 2024-01-25 14:19:53,802 [info] To track results use the CLI: {'info_cmd': 'mlrun get run cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni', 'logs_cmd': 'mlrun logs cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni'}\n", - "> 2024-01-25 14:19:53,802 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/sagemaker-yoni/jobs/monitor/cac9cd3c55ba40d58fbe1156d4861e79/overview'}\n", - "> 2024-01-25 14:19:53,803 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + "> 2024-02-04 15:52:56,835 [error] Exec error - An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", + "An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", + "> 2024-02-04 15:52:56,879 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 6c838c8bbb234d7eb642d9401eff7068 -p sagemaker-admin', 'logs_cmd': 'mlrun logs 6c838c8bbb234d7eb642d9401eff7068 -p sagemaker-admin'}\n", + "> 2024-02-04 15:52:56,879 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-admin/jobs/monitor/6c838c8bbb234d7eb642d9401eff7068/overview'}\n", + "> 2024-02-04 15:52:56,880 [info] Run execution finished: {'status': 'error', 'name': 'evaluate-evaluate'}\n", + "Runtime error: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n" ] }, { @@ -2201,27 +2289,27 @@ " \n", " \n", " \n", - " sagemaker-yoni\n", - " \n", + " sagemaker-admin\n", + " \n", " 0\n", - " Jan 25 14:19:51\n", - " 
completed\n", + " Feb 04 15:52:55\n", + "
error
\n", " evaluate-evaluate\n", - "
v3io_user=yoni
kind=job
owner=yoni
mlrun/client_version=1.6.0-rc21
mlrun/client_python_version=3.9.16
host=evaluate-evaluate-5rrtk
\n", + "
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc22
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-hdk2c
\n", + " \n", + "
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category
factorize_key={'Uncategorized': '0', 'Entertainment': '1', 'Education': '2', 'Shopping': '3', 'Personal Care': '4', 'Health and Fitness': '5', 'Food and Dining': '6', 'Gifts and Donations': '7', 'Investments': '8', 'Bills and Utilities': '9', 'Auto and Transport': '10', 'Travel': '11', 'Fees and Charges': '12', 'Business Services': '13', 'Personal Services': '14', 'Taxes': '15', 'Gambling': '16', 'Home': '17', 'Pension and insurances': '18'}
\n", " \n", - "
model_path=s3://sagemaker-us-east-1-934638699319/payment-classification/output/sagemaker-xgboost-2024-01-25-14-12-01-149/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-1-934638699319/payment-classification/test/test.csv
label_column=transaction_category
factorize_key={'Uncategorized': '0', 'Entertainment': '1', 'Education': '2', 'Shopping': '3', 'Personal Care': '4', 'Health and Fitness': '5', 'Food and Dining': '6', 'Gifts and Donations': '7', 'Investments': '8', 'Bills and Utilities': '9', 'Auto and Transport': '10', 'Travel': '11', 'Fees and Charges': '12', 'Business Services': '13', 'Personal Services': '14', 'Taxes': '15', 'Gambling': '16', 'Home': '17', 'Pension and insurances': '18'}
\n", " \n", - "
classification_report
\n", " \n", " \n", "\n", "\n", - "
\n", + "
\n", "
\n", - " Title\n", - " ×\n", + " Title\n", + " ×\n", "
\n", - " \n", + " \n", "
\n", "
\n" ], @@ -2242,7 +2330,7 @@ { "data": { "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" + " > to track results use the .show() or .logs() methods or click here to open in UI" ], "text/plain": [ "" @@ -2255,7 +2343,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:19:59,831 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + "> 2024-02-04 15:53:03,539 [info] Run execution finished: {'status': 'error', 'name': 'evaluate-evaluate'}\n", + "> 2024-02-04 15:53:03,540 [error] Run did not finish successfully: {'state': 'error', 'status': {'state': 'error', 'error': 'An error occurred (AccessDenied) when calling the GetObject operation: Access Denied', 'artifacts': [], 'start_time': '2024-02-04T15:52:55.840221+00:00', 'last_update': '2024-02-04T15:52:56.873792+00:00'}}\n" + ] + }, + { + "ename": "RunError", + "evalue": "An error occurred (AccessDenied) when calling the GetObject operation: Access Denied", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[37], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m evaluate_run \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate_function\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mevaluate\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel_path\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mxgb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_data\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mxgboost-model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtest_set\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43ms3_data\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlabel_column\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtransaction_category\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfactorize_key\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfactorize_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclassification_report: dataset\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/base.py:369\u001b[0m, in \u001b[0;36mBaseRuntime.run\u001b[0;34m(self, 
runspec, handler, name, project, params, inputs, out_path, workdir, artifact_path, watch, schedule, hyperparams, hyper_param_options, verbose, scrape_metrics, local, local_code_path, auto_build, param_file_secrets, notifications, returns, state_thresholds, **launcher_kwargs)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 313\u001b[0m \u001b[38;5;124;03mRun a local or remote task.\u001b[39;00m\n\u001b[1;32m 314\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[38;5;124;03m:return: Run context object (RunObject) with run metadata, results and status\u001b[39;00m\n\u001b[1;32m 365\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 366\u001b[0m launcher \u001b[38;5;241m=\u001b[39m mlrun\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mfactory\u001b[38;5;241m.\u001b[39mLauncherFactory()\u001b[38;5;241m.\u001b[39mcreate_launcher(\n\u001b[1;32m 367\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_remote, local\u001b[38;5;241m=\u001b[39mlocal, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mlauncher_kwargs\n\u001b[1;32m 368\u001b[0m )\n\u001b[0;32m--> 369\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlauncher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 370\u001b[0m \u001b[43m \u001b[49m\u001b[43mruntime\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[43mtask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrunspec\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhandler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 374\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 376\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 377\u001b[0m \u001b[43m \u001b[49m\u001b[43mout_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mout_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 378\u001b[0m \u001b[43m \u001b[49m\u001b[43mworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mworkdir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 379\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifact_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 380\u001b[0m \u001b[43m \u001b[49m\u001b[43mwatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 381\u001b[0m \u001b[43m \u001b[49m\u001b[43mschedule\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[43m \u001b[49m\u001b[43mhyperparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhyperparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[43m \u001b[49m\u001b[43mhyper_param_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhyper_param_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mscrape_metrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscrape_metrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_code_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_code_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mauto_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mauto_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mparam_file_secrets\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparam_file_secrets\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mnotifications\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnotifications\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_thresholds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstate_thresholds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/launcher/remote.py:113\u001b[0m, in \u001b[0;36mClientRemoteLauncher.launch\u001b[0;34m(self, runtime, task, handler, name, project, params, inputs, out_path, workdir, artifact_path, watch, schedule, hyperparams, hyper_param_options, verbose, scrape_metrics, local_code_path, auto_build, param_file_secrets, notifications, returns, state_thresholds)\u001b[0m\n\u001b[1;32m 105\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 106\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStoring function\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 107\u001b[0m name\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m 
108\u001b[0m uid\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39muid,\n\u001b[1;32m 109\u001b[0m db\u001b[38;5;241m=\u001b[39mruntime\u001b[38;5;241m.\u001b[39mspec\u001b[38;5;241m.\u001b[39mrundb,\n\u001b[1;32m 110\u001b[0m )\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_store_function(runtime, run)\n\u001b[0;32m--> 113\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_submit_job\u001b[49m\u001b[43m(\u001b[49m\u001b[43mruntime\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mschedule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwatch\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/launcher/remote.py:182\u001b[0m, in \u001b[0;36mClientRemoteLauncher._submit_job\u001b[0;34m(self, runtime, run, schedule, watch)\u001b[0m\n\u001b[1;32m 179\u001b[0m run\u001b[38;5;241m.\u001b[39mlogs(\u001b[38;5;28;01mTrue\u001b[39;00m, runtime\u001b[38;5;241m.\u001b[39m_get_db())\n\u001b[1;32m 180\u001b[0m resp \u001b[38;5;241m=\u001b[39m runtime\u001b[38;5;241m.\u001b[39m_get_db_run(run)\n\u001b[0;32m--> 182\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wrap_run_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43mruntime\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mschedule\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/launcher/base.py:409\u001b[0m, in \u001b[0;36mBaseLauncher._wrap_run_result\u001b[0;34m(self, runtime, result, run, 
schedule, err)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m runtime\u001b[38;5;241m.\u001b[39m_is_remote \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m runtime\u001b[38;5;241m.\u001b[39mis_child:\n\u001b[1;32m 404\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\n\u001b[1;32m 405\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRun did not finish successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 406\u001b[0m state\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 407\u001b[0m status\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mto_dict(),\n\u001b[1;32m 408\u001b[0m )\n\u001b[0;32m--> 409\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mRunError(run\u001b[38;5;241m.\u001b[39merror)\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m run\n\u001b[1;32m 412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "\u001b[0;31mRunError\u001b[0m: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied" ] } ], @@ -2282,227 +2386,10 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
precisionrecallf1-scoresupport
Uncategorized1.0000001.0000001.00000051.0000
Entertainment1.0000001.0000001.0000001486.0000
Education1.0000001.0000001.00000080.0000
Shopping1.0000001.0000001.0000003441.0000
Personal Care1.0000001.0000001.000000132.0000
Health and Fitness1.0000001.0000001.000000443.0000
Food and Dining1.0000001.0000001.000000918.0000
Gifts and Donations1.0000001.0000001.000000275.0000
Investments1.0000001.0000001.00000088.0000
Bills and Utilities1.0000001.0000001.000000332.0000
Auto and Transport1.0000001.0000001.0000001967.0000
Travel1.0000001.0000001.000000120.0000
Fees and Charges1.0000001.0000001.000000106.0000
Business Services1.0000001.0000001.000000146.0000
Personal Services1.0000001.0000001.00000075.0000
Taxes1.0000000.9787230.98924747.0000
Gambling0.9375001.0000000.96774215.0000
Home1.0000001.0000001.000000168.0000
Pension and insurances1.0000001.0000001.000000110.0000
accuracy0.9999000.9999000.9999000.9999
macro avg0.9967110.9988800.99773610000.0000
weighted avg0.9999060.9999000.99990110000.0000
\n", - "
" - ], - "text/plain": [ - " precision recall f1-score support\n", - "Uncategorized 1.000000 1.000000 1.000000 51.0000\n", - "Entertainment 1.000000 1.000000 1.000000 1486.0000\n", - "Education 1.000000 1.000000 1.000000 80.0000\n", - "Shopping 1.000000 1.000000 1.000000 3441.0000\n", - "Personal Care 1.000000 1.000000 1.000000 132.0000\n", - "Health and Fitness 1.000000 1.000000 1.000000 443.0000\n", - "Food and Dining 1.000000 1.000000 1.000000 918.0000\n", - "Gifts and Donations 1.000000 1.000000 1.000000 275.0000\n", - "Investments 1.000000 1.000000 1.000000 88.0000\n", - "Bills and Utilities 1.000000 1.000000 1.000000 332.0000\n", - "Auto and Transport 1.000000 1.000000 1.000000 1967.0000\n", - "Travel 1.000000 1.000000 1.000000 120.0000\n", - "Fees and Charges 1.000000 1.000000 1.000000 106.0000\n", - "Business Services 1.000000 1.000000 1.000000 146.0000\n", - "Personal Services 1.000000 1.000000 1.000000 75.0000\n", - "Taxes 1.000000 0.978723 0.989247 47.0000\n", - "Gambling 0.937500 1.000000 0.967742 15.0000\n", - "Home 1.000000 1.000000 1.000000 168.0000\n", - "Pension and insurances 1.000000 1.000000 1.000000 110.0000\n", - "accuracy 0.999900 0.999900 0.999900 0.9999\n", - "macro avg 0.996711 0.998880 0.997736 10000.0000\n", - "weighted avg 0.999906 0.999900 0.999901 10000.0000" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "evaluate_run.artifact(\"classification_report\").as_df()" ] @@ -2561,7 +2448,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "f79b1164", "metadata": {}, "outputs": [], diff --git a/src/functions/serving.py b/src/functions/serving.py index 50de02a..c600f36 100644 --- a/src/functions/serving.py +++ b/src/functions/serving.py @@ -4,12 +4,11 @@ import mlrun import numpy as np -import pandas as pd import xgboost as xgb from cloudpickle import load - import mlrun.feature_store as fstore + warnings.filterwarnings("ignore") 
@@ -53,39 +52,6 @@ def _set_model_path(self): # set model path: self.model_path = model_path - -# Function that preprocesses the inference data -def preprocess(self, data: pd.Dataframe): - unique_categories = data.transaction_category.unique() - # Create a feature vector that gets the average amount - vector = fstore.FeatureVector("transactions_vector", ["aggregations.amount_avg_1d"], with_indexes=True) - - # Use online feature service to get the latest average amount per category - with vector.get_online_feature_service() as online_feature_service: - resp = online_feature_service.get( - [{"transaction_category":cat} for cat in unique_categories] - ) - - for cat in resp: - transaction_category = cat['transaction_category'] - amount_avg = cat['amount_avg_1d'] - data["dist_" + transaction_category] = abs(amount_avg - data["amount"]) - - # convert timestamp to components - data["year"] = data["timestamp"].dt.year - data["month"] = data["timestamp"].dt.month - data["day"] = data["timestamp"].dt.day - data["hour"] = data["timestamp"].dt.hour - data["minute"] = data["timestamp"].dt.minute - data["second"] = data["timestamp"].dt.second - - del data["timestamp"] - del data["transaction_category"] - - return data - - - def postprocess(inputs: dict) -> dict: @@ -108,3 +74,40 @@ def postprocess(inputs: dict) -> dict: inputs["predictions"] = predictions inputs["confidences"] = confidences return inputs + +# Function that preprocesses the inference data +def preprocess(event): + print("--------------------") + print(event) + + # Create a feature vector that gets the average amount + vector = fstore.FeatureVector("transactions_vector", ["aggregations.amount_avg_1d"], with_indexes=True) + unique_categories = ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16"] + # Use online feature service to get the latest average amount per category + with vector.get_online_feature_service() as online_feature_service: + resp = online_feature_service.get( + 
[{"transaction_category":cat} for cat in unique_categories] + ) + + print('---------') + print(resp) + + for cat in resp: + transaction_category = cat['transaction_category'] + amount_avg = cat['amount_avg_1d'] + event[0]["dist_" + transaction_category] = abs(amount_avg - event[0]["amount"]) + + print(event) + # # convert timestamp to components + # event["year"] = event["timestamp"].dt.year + # event["month"] = event["timestamp"].dt.month + # event["day"] = event["timestamp"].dt.day + # event["hour"] = event["timestamp"].dt.hour + # event["minute"] = event["timestamp"].dt.minute + # event["second"] = event["timestamp"].dt.second + + # del data["timestamp"] + # del data["transaction_category"] + event_list = list(list(event[0].values())) + + return event_list \ No newline at end of file From bee786b67b7cf9cb5bd76fc7274c70d13decceac Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Tue, 6 Feb 2024 10:17:24 +0000 Subject: [PATCH 03/16] add serving example --- financial_payment_classification_v2.ipynb | 517 ++++++------ serving-Copy1.ipynb | 328 ++++++++ serving.ipynb | 955 ++++++++++++++++++++++ src/functions/serving.py | 64 +- 4 files changed, 1610 insertions(+), 254 deletions(-) create mode 100644 serving-Copy1.ipynb create mode 100644 serving.ipynb diff --git a/financial_payment_classification_v2.ipynb b/financial_payment_classification_v2.ipynb index 64b39f1..7310897 100644 --- a/financial_payment_classification_v2.ipynb +++ b/financial_payment_classification_v2.ipynb @@ -108,14 +108,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Project Source: git://github.com/mlrun/demo-sagemaker#development\n", - "> 2024-02-04 15:29:54,524 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" + "> 2024-02-06 08:47:06,901 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" ] } ], "source": [ "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker\", \n", + " name=\"sagemaker-v2\", \n", " user_project=True,\n", " 
parameters={\n", " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", @@ -153,9 +152,7 @@ "import sagemaker\n", "import time\n", "import os\n", - "from time import sleep\n", "from sklearn.metrics import classification_report\n", - "from sagemaker.feature_store.feature_group import FeatureGroup\n", "import pandas as pd\n", "import numpy as np\n", "from datetime import datetime, timedelta" @@ -202,7 +199,6 @@ "sm_client = boto3.client(\"sagemaker\")\n", "boto_session = boto3.Session(region_name=region)\n", "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", - "#role = sagemaker.get_execution_role()\n", "role = sagemaker_role\n", "bucket_prefix = \"payment-classification\"\n", "s3_bucket = sagemaker_session.default_bucket()" @@ -337,43 +333,6 @@ { "cell_type": "code", "execution_count": 12, - "id": "0ee06b1d-0cfb-4242-a7e7-2443a0377d99", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\n", - "[ 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances']\n", - "Length: 19, dtype: string" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data[\"transaction_category\"].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "f7314f8a", - "metadata": {}, - "source": [ - "We'll transform the transaction categories to numeric targets for the classification by factorization." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, "id": "ea2ebdd5", "metadata": {}, "outputs": [], @@ -383,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", "metadata": { "scrolled": true @@ -573,7 +532,7 @@ " 4.904096e+15\n", " 4.133603e+15\n", " 124.08\n", - " 2024-02-02 15:00:00\n", + " 2024-02-04 15:00:00\n", " \n", " \n", " 70592\n", @@ -581,7 +540,7 @@ " 4.904096e+15\n", " 4.444087e+15\n", " 188.66\n", - " 2024-02-03 10:00:00\n", + " 2024-02-05 10:00:00\n", " \n", " \n", " 70379\n", @@ -589,7 +548,7 @@ " 4.200241e+15\n", " 4.202495e+15\n", " 139.27\n", - " 2024-02-03 15:00:00\n", + " 2024-02-05 15:00:00\n", " \n", " \n", " 70462\n", @@ -597,7 +556,7 @@ " 4.612985e+15\n", " 4.525455e+15\n", " 12.49\n", - " 2024-02-04 10:00:00\n", + " 2024-02-06 10:00:00\n", " \n", " \n", " 71672\n", @@ -605,7 +564,7 @@ " 4.538817e+15\n", " 4.291294e+15\n", " 57.03\n", - " 2024-02-04 15:00:00\n", + " 2024-02-06 15:00:00\n", " \n", " \n", "\n", @@ -633,16 +592,16 @@ "17 2021-01-01 18:33:18 \n", "178 2021-01-01 19:33:31 \n", "... ... 
\n", - "69938 2024-02-02 15:00:00 \n", - "70592 2024-02-03 10:00:00 \n", - "70379 2024-02-03 15:00:00 \n", - "70462 2024-02-04 10:00:00 \n", - "71672 2024-02-04 15:00:00 \n", + "69938 2024-02-04 15:00:00 \n", + "70592 2024-02-05 10:00:00 \n", + "70379 2024-02-05 15:00:00 \n", + "70462 2024-02-06 10:00:00 \n", + "71672 2024-02-06 15:00:00 \n", "\n", "[99997 rows x 5 columns]" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -676,63 +635,7 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "3c621044-681a-4e1a-9968-f637ed992539", - "metadata": {}, - "outputs": [], - "source": [ - "# Function that gets a dataframe, creates daily dates and group by the date and the given column.\n", - "# It returns a dataframe where the index is daily dates, columns are the categories, and each value is the last average for that category for a given day\n", - "def get_last_transaction_avg_per_day(df, column_to_groupby):\n", - " df['date'] = pd.to_datetime(df['timestamp']).dt.date\n", - " df = df.groupby(['date', df.index])[column_to_groupby].last()\n", - " df = df.unstack(fill_value=0)\n", - " return df\n", - "\n", - "\n", - "# Function that gets a dataframe and calculates a moving average per category and the distance between the row's amount to each category average\n", - "def add_grouped_features(data):\n", - "\n", - " df_with_cat_avg = data\n", - " df_with_cat_avg.sort_values([\"transaction_category\", \"timestamp\"], inplace=True)\n", - " \n", - " # Convert the timestamp to daily date and remove the aggregated average\n", - " df_with_cat_avg['date'] = pd.to_datetime(df_with_cat_avg['timestamp']).dt.date\n", - " df_without_cat_avg = df_with_cat_avg.drop(\"amount_avg_1d\", axis=1)\n", - " \n", - " # Get the daily average per transaction category\n", - " df_with_all_cat_avg = get_last_transaction_avg_per_day(df_with_cat_avg, 'amount_avg_1d')\n", - " \n", - " # Now let's join the 2 dataframes + calculate distance 
from average\n", - " unique_categories = df_without_cat_avg.index.unique()\n", - " df_without_cat_avg = df_without_cat_avg.reset_index()\n", - " \n", - " # Join the 2 dataframes\n", - " df_merged = pd.merge(df_without_cat_avg, df_with_all_cat_avg, on='date', how='outer')\n", - "\n", - " # For each transaction_category, calculate the distance and remove the category column\n", - " for col in unique_categories:\n", - " df_merged[\"dist_\" + col] = abs(df_merged[col] - df_merged[\"amount\"])\n", - " df_merged.drop(col, axis=1, inplace=True)\n", - " \n", - " # Split the timestamp into components\n", - " df_merged[\"year\"] = df_merged[\"timestamp\"].dt.year\n", - " df_merged[\"month\"] = df_merged[\"timestamp\"].dt.month\n", - " df_merged[\"day\"] = df_merged[\"timestamp\"].dt.day\n", - " df_merged[\"hour\"] = df_merged[\"timestamp\"].dt.hour\n", - " df_merged[\"minute\"] = df_merged[\"timestamp\"].dt.minute\n", - " df_merged[\"second\"] = df_merged[\"timestamp\"].dt.second\n", - "\n", - " del df_merged[\"timestamp\"]\n", - " del df_merged[\"date\"] \n", - " df_merged['transaction_id']= df_merged.reset_index().index \n", - " \n", - " return df_merged" - ] - }, - { - "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "id": "07fdb07a-f3b7-4255-b38b-17a939b8676d", "metadata": {}, "outputs": [ @@ -772,10 +675,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -791,26 +694,14 @@ ")\n", "fset.add_aggregation(\"amount\", [\"avg\"], \"1d\")\n", "fset.set_targets()\n", - "fset.graph.plot()\n", - " \n", - "# Ingest the data (will perform the aggregation)\n" + "fset.graph.plot()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "id": "e213e91e-276a-4cde-a2fc-059369cc837a", "metadata": {}, - "outputs": [], - "source": [ - "df_with_cat_avg = fset.ingest(data, return_df=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, 
- "id": "19081c06-240e-481b-bfe3-588bb77bd54e", - "metadata": {}, "outputs": [ { "data": { @@ -903,7 +794,7 @@ " 4.904096e+15\n", " 4.133603e+15\n", " 124.08\n", - " 2024-02-02 15:00:00\n", + " 2024-02-04 15:00:00\n", " \n", " \n", " 9\n", @@ -911,7 +802,7 @@ " 4.904096e+15\n", " 4.444087e+15\n", " 188.66\n", - " 2024-02-03 10:00:00\n", + " 2024-02-05 10:00:00\n", " \n", " \n", " 9\n", @@ -919,7 +810,7 @@ " 4.200241e+15\n", " 4.202495e+15\n", " 139.27\n", - " 2024-02-03 15:00:00\n", + " 2024-02-05 15:00:00\n", " \n", " \n", " 9\n", @@ -927,7 +818,7 @@ " 4.612985e+15\n", " 4.525455e+15\n", " 12.49\n", - " 2024-02-04 10:00:00\n", + " 2024-02-06 10:00:00\n", " \n", " \n", " 9\n", @@ -935,7 +826,7 @@ " 4.538817e+15\n", " 4.291294e+15\n", " 57.03\n", - " 2024-02-04 15:00:00\n", + " 2024-02-06 15:00:00\n", " \n", " \n", "\n", @@ -965,31 +856,106 @@ "0 2021-01-01 18:33:18 \n", "0 2021-01-01 19:33:31 \n", "... ... \n", - "9 2024-02-02 15:00:00 \n", - "9 2024-02-03 10:00:00 \n", - "9 2024-02-03 15:00:00 \n", - "9 2024-02-04 10:00:00 \n", "9 2024-02-04 15:00:00 \n", + "9 2024-02-05 10:00:00 \n", + "9 2024-02-05 15:00:00 \n", + "9 2024-02-06 10:00:00 \n", + "9 2024-02-06 15:00:00 \n", "\n", "[99997 rows x 5 columns]" ] }, - "execution_count": 18, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "df_with_cat_avg = fset.ingest(data, return_df=True)\n", "df_with_cat_avg" ] }, { "cell_type": "code", - "execution_count": null, - "id": "510e4ca6-51bd-4431-837e-a634ed1a38ba", + "execution_count": 33, + "id": "50441ed4-a228-44e7-87ce-024177b928f6", + "metadata": {}, + "outputs": [], + "source": [ + "# Import MLRun's Feature Store\n", + "import mlrun.feature_store as fstore\n", + "\n", + "# create feature vector on top of aggreagations\n", + "# Define the list of features we will be using\n", + "features = ['aggregations.*']\n", + "\n", + "# Define the feature vector name for future reference\n", + "fv_name = 'aggreagations-vector'\n", + 
"\n", + "# Define the feature vector using our Feature Store (fstore)\n", + "aggregations_fv = fstore.FeatureVector(fv_name, \n", + " features, \n", + " description='stocks information')\n", + "\n", + "# Save the feature vector in the Feature Store\n", + "aggregations_fv.save()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "19081c06-240e-481b-bfe3-588bb77bd54e", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Function that gets a dataframe, creates daily dates and group by the date and the given column.\n", + "# It returns a dataframe where the index is daily dates, columns are the categories, and each value is the last average for that category for a given day\n", + "def get_last_transaction_avg_per_day(df, column_to_groupby):\n", + " df['date'] = pd.to_datetime(df['timestamp']).dt.date\n", + " df = df.groupby(['date', df.index])[column_to_groupby].last()\n", + " df = df.unstack(fill_value=0)\n", + " return df\n", + "\n", + "\n", + "# Function that gets a dataframe and calculates a moving average per category and the distance between the row's amount to each category average\n", + "def add_grouped_features(data):\n", + "\n", + " df_with_cat_avg = data\n", + " df_with_cat_avg.sort_values([\"transaction_category\", \"timestamp\"], inplace=True)\n", + " \n", + " # Convert the timestamp to daily date and remove the aggregated average\n", + " df_with_cat_avg['date'] = pd.to_datetime(df_with_cat_avg['timestamp']).dt.date\n", + " df_without_cat_avg = df_with_cat_avg.drop(\"amount_avg_1d\", axis=1)\n", + " \n", + " # Get the daily average per transaction category\n", + " df_with_all_cat_avg = get_last_transaction_avg_per_day(df_with_cat_avg, 'amount_avg_1d')\n", + " \n", + " # Now let's join the 2 dataframes + calculate distance from average\n", + " unique_categories = df_without_cat_avg.index.unique()\n", + " df_without_cat_avg = df_without_cat_avg.reset_index()\n", + " \n", + " # Join the 2 dataframes\n", + " df_merged = 
pd.merge(df_without_cat_avg, df_with_all_cat_avg, on='date', how='outer')\n", + "\n", + " # For each transaction_category, calculate the distance and remove the category column\n", + " for col in unique_categories:\n", + " df_merged[\"dist_\" + col] = abs(df_merged[col] - df_merged[\"amount\"])\n", + " df_merged.drop(col, axis=1, inplace=True)\n", + " \n", + " # Split the timestamp into components\n", + " df_merged[\"year\"] = df_merged[\"timestamp\"].dt.year\n", + " df_merged[\"month\"] = df_merged[\"timestamp\"].dt.month\n", + " df_merged[\"day\"] = df_merged[\"timestamp\"].dt.day\n", + " df_merged[\"hour\"] = df_merged[\"timestamp\"].dt.hour\n", + " df_merged[\"minute\"] = df_merged[\"timestamp\"].dt.minute\n", + " df_merged[\"second\"] = df_merged[\"timestamp\"].dt.second\n", + "\n", + " del df_merged[\"timestamp\"]\n", + " del df_merged[\"date\"] \n", + " df_merged['transaction_id']= df_merged.reset_index().index \n", + " \n", + " return df_merged" + ] }, { "cell_type": "code", @@ -1059,7 +1025,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 19, @@ -1097,7 +1063,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-04 15:43:38,187 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" + "> 2024-02-06 08:03:53,056 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. 
This may result in errors or unusable data.\n" ] }, { @@ -1691,39 +1657,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-04-15-43-54-687\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-06-08-05-13-165\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-02-04 15:43:54 Starting - Starting the training job......\n", - "2024-02-04 15:44:40 Starting - Preparing the instances for training......\n", - "2024-02-04 15:45:37 Downloading - Downloading input data...\n", - "2024-02-04 15:46:07 Downloading - Downloading the training image......\n", - "2024-02-04 15:47:02 Training - Training image download completed. Training in progress..\u001b[34m[2024-02-04 15:47:19.110 ip-10-0-250-195.us-east-2.compute.internal:6 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-06 08:05:13 Starting - Starting the training job......\n", + "2024-02-06 08:05:48 Starting - Preparing the instances for training...\n", + "2024-02-06 08:06:39 Downloading - Downloading input data...\n", + "2024-02-06 08:07:09 Downloading - Downloading the training image......\n", + "2024-02-06 08:07:59 Training - Training image download completed. 
Training in progress...\u001b[34m[2024-02-06 08:08:16.516 ip-10-0-135-116.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Validation matrix has 20000 rows\u001b[0m\n", - "\u001b[34m[2024-02-04 15:47:19.401 ip-10-0-250-195.us-east-2.compute.internal:6 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-04 15:47:19.402 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-04 15:47:19.402 ip-10-0-250-195.us-east-2.compute.internal:6 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-04 15:47:19.403 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-04 15:47:19.403 ip-10-0-250-195.us-east-2.compute.internal:6 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-04:15:47:19:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Validation matrix has 20000 rows\u001b[0m\n", + "\u001b[34m[2024-02-06 08:08:16.809 ip-10-0-135-116.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-06 08:08:16.809 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-06 08:08:16.810 ip-10-0-135-116.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-06 08:08:16.811 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-06 08:08:16.811 ip-10-0-135-116.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-06:08:08:16:INFO] Debug hook created from config\u001b[0m\n", "\u001b[34m[0]#011train-merror:0.54515#011validation-merror:0.55430\u001b[0m\n", - "\u001b[34m[2024-02-04 15:47:21.396 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-04 15:47:21.399 ip-10-0-250-195.us-east-2.compute.internal:6 INFO hook.py:486] Hook is writing from the hook with pid: 6\u001b[0m\n", + "\u001b[34m[2024-02-06 08:08:18.833 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-06 08:08:18.836 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", "\u001b[34m[1]#011train-merror:0.53387#011validation-merror:0.54255\u001b[0m\n", "\u001b[34m[2]#011train-merror:0.52198#011validation-merror:0.53050\u001b[0m\n", "\u001b[34m[3]#011train-merror:0.51036#011validation-merror:0.52010\u001b[0m\n", @@ -1780,52 +1746,52 @@ "\u001b[34m[54]#011train-merror:0.22725#011validation-merror:0.24250\u001b[0m\n", "\u001b[34m[55]#011train-merror:0.22378#011validation-merror:0.23920\u001b[0m\n", "\u001b[34m[56]#011train-merror:0.22085#011validation-merror:0.23610\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.21870#011validation-merror:0.23400\u001b[0m\n", - 
"\u001b[34m[58]#011train-merror:0.21478#011validation-merror:0.23010\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.21048#011validation-merror:0.22455\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.20771#011validation-merror:0.22170\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.20252#011validation-merror:0.21665\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.19939#011validation-merror:0.21390\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.19668#011validation-merror:0.21095\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.19499#011validation-merror:0.20855\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.18875#011validation-merror:0.20205\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.18505#011validation-merror:0.19835\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.18059#011validation-merror:0.19375\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.17695#011validation-merror:0.19005\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.17485#011validation-merror:0.18875\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.17348#011validation-merror:0.18735\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.17201#011validation-merror:0.18580\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.16625#011validation-merror:0.18010\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.16215#011validation-merror:0.17615\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.15796#011validation-merror:0.17250\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.15368#011validation-merror:0.16840\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.15146#011validation-merror:0.16645\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.15016#011validation-merror:0.16520\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.14722#011validation-merror:0.16150\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.14631#011validation-merror:0.16020\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.14452#011validation-merror:0.15885\u001b[0m\n", - 
"\u001b[34m[81]#011train-merror:0.14262#011validation-merror:0.15690\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.14009#011validation-merror:0.15445\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.13919#011validation-merror:0.15385\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.13535#011validation-merror:0.14980\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.13311#011validation-merror:0.14770\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.13081#011validation-merror:0.14585\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.12935#011validation-merror:0.14420\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.12825#011validation-merror:0.14325\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.12615#011validation-merror:0.14100\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.12518#011validation-merror:0.13960\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.12415#011validation-merror:0.13890\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.12099#011validation-merror:0.13485\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.11899#011validation-merror:0.13305\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.11806#011validation-merror:0.13175\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.11681#011validation-merror:0.13050\u001b[0m\n", - "\n", - "2024-02-04 15:50:23 Uploading - Uploading generated training model\u001b[34m[96]#011train-merror:0.11598#011validation-merror:0.12980\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.11502#011validation-merror:0.12865\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.11488#011validation-merror:0.12835\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.11256#011validation-merror:0.12620\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.21867#011validation-merror:0.23400\u001b[0m\n", + "\u001b[34m[58]#011train-merror:0.21477#011validation-merror:0.23010\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.21047#011validation-merror:0.22455\u001b[0m\n", + 
"\u001b[34m[60]#011train-merror:0.20769#011validation-merror:0.22170\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.20254#011validation-merror:0.21670\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.19889#011validation-merror:0.21330\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.19612#011validation-merror:0.21035\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.19432#011validation-merror:0.20815\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.18804#011validation-merror:0.20185\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.18431#011validation-merror:0.19805\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.18051#011validation-merror:0.19390\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.17619#011validation-merror:0.19000\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.17419#011validation-merror:0.18745\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.17288#011validation-merror:0.18590\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.17129#011validation-merror:0.18445\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.16491#011validation-merror:0.17750\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.16085#011validation-merror:0.17335\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.15854#011validation-merror:0.17080\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.15472#011validation-merror:0.16830\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.15298#011validation-merror:0.16705\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.15158#011validation-merror:0.16590\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.14944#011validation-merror:0.16430\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.14696#011validation-merror:0.16185\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.14485#011validation-merror:0.15990\u001b[0m\n", + "\u001b[34m[81]#011train-merror:0.14281#011validation-merror:0.15785\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.14082#011validation-merror:0.15545\u001b[0m\n", + 
"\u001b[34m[83]#011train-merror:0.14005#011validation-merror:0.15490\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.13581#011validation-merror:0.15075\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.13385#011validation-merror:0.14885\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.13242#011validation-merror:0.14735\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.12923#011validation-merror:0.14400\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.12895#011validation-merror:0.14380\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.12702#011validation-merror:0.14160\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.12629#011validation-merror:0.14090\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.12568#011validation-merror:0.14010\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.12215#011validation-merror:0.13690\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.11968#011validation-merror:0.13450\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.11878#011validation-merror:0.13360\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.11785#011validation-merror:0.13240\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.11631#011validation-merror:0.13090\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.11509#011validation-merror:0.12975\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.11213#011validation-merror:0.12605\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.11039#011validation-merror:0.12445\u001b[0m\n", "\n", - "2024-02-04 15:50:39 Completed - Training job completed\n", + "2024-02-06 08:11:41 Uploading - Uploading generated training model\n", + "2024-02-06 08:11:41 Completed - Training job completed\n", "Training seconds: 303\n", "Billable seconds: 303\n" ] @@ -1854,7 +1820,7 @@ { "data": { "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz'" + 
"'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-06-08-05-13-165/output/model.tar.gz'" ] }, "execution_count": 29, @@ -1942,7 +1908,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 31, @@ -1975,25 +1941,114 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-04 15:51:09,295 [info] Starting remote function deploy\n", - "2024-02-04 15:51:09 (info) Deploying function\n", - "2024-02-04 15:51:09 (info) Building\n", - "2024-02-04 15:51:10 (info) Staging files and preparing base images\n", - "2024-02-04 15:51:10 (info) Building processor image\n", - "2024-02-04 15:52:46 (info) Build complete\n", - "2024-02-04 15:52:46 (info) Function deploy complete\n", - "> 2024-02-04 15:52:51,661 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-admin-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" + "> 2024-02-06 08:13:22,438 [info] Starting remote function deploy\n", + "2024-02-06 08:13:22 (info) Deploying function\n", + "2024-02-06 08:13:22 (info) Building\n", + "2024-02-06 08:13:23 (info) Staging files and preparing base images\n", + "2024-02-06 08:13:23 (info) Building processor image\n", + "2024-02-06 08:15:18 (info) Build complete\n", + "Failed to deploy. 
Details:\n", + "Traceback (most recent call last):\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", + " self.load()\n", + " File \"/opt/nuclio/serving.py\", line 21, in load\n", + " model_file, extra_data = self.get_model(\".tar.gz\")\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", + " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", + " obj.download(temp_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", + " self._store.download(self._path, target_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", + " data = self.get(key)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", + " return obj.get()[\"Body\"].read()\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", + " response = action(self, *args, **kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", + " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", + " return self._make_api_call(operation_name, kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", + " raise error_class(parsed_response, operation_name)\n", + "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", + " [worker_id=0]\n", + "Exception raised while running init_context [worker_id=0]\n", + "Traceback (most recent call last):\n", + " File 
\"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", + " self.load()\n", + " File \"/opt/nuclio/serving.py\", line 21, in load\n", + " model_file, extra_data = self.get_model(\".tar.gz\")\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", + " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", + " obj.download(temp_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", + " self._store.download(self._path, target_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", + " data = self.get(key)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", + " return obj.get()[\"Body\"].read()\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", + " response = action(self, *args, **kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", + " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", + " return self._make_api_call(operation_name, kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", + " raise error_class(parsed_response, operation_name)\n", + "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 480, in \n", + " 
run_wrapper()\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 468, in run_wrapper\n", + " loop.run_until_complete(wrapper_instance.initialize())\n", + " File \"/opt/conda/lib/python3.9/asyncio/base_events.py\", line 647, in run_until_complete\n", + " return future.result()\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 165, in initialize\n", + " await self._initialize_context()\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 188, in _initialize_context\n", + " init_context_result = getattr(self._entrypoint_module, 'init_context')(self._context)\n", + " File \"/opt/nuclio/serving.py\", line 116, in init_context\n", + " nuclio_init_hook(context, globals(), 'serving_v2')\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/runtimes/nuclio.py\", line 34, in nuclio_init_hook\n", + " v2_serving_init(context, data)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 349, in v2_serving_init\n", + " serving_handler = server.init_object(namespace or get_caller_globals())\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 192, in init_object\n", + " self.graph.init_object(self.context, namespace, self.load_mode, reset=True)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 917, in init_object\n", + " step.init_object(context, namespace, mode, reset=reset)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 444, in init_object\n", + " self._post_init(mode)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 502, in _post_init\n", + " self._object.post_init(mode)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 143, in post_init\n", + " self._load_and_update_state()\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 131, in _load_and_update_state\n", + " raise RuntimeError(f\"failed to load model 
{self.name}\") from exc\n", + "RuntimeError: failed to load model xgboost-model\n", + "> 2024-02-06 08:15:44,441 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" ] }, { - "data": { - "text/plain": [ - "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-admin-serving'})" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" + "ename": "RunError", + "evalue": "Function serving deployment failed", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[32], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/project.py:3188\u001b[0m, in \u001b[0;36mMlrunProject.deploy_function\u001b[0;34m(self, function, dashboard, models, env, tag, verbose, builder_env, mock)\u001b[0m\n\u001b[1;32m 3166\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeploy_function\u001b[39m(\n\u001b[1;32m 3167\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 3168\u001b[0m function: typing\u001b[38;5;241m.\u001b[39mUnion[\u001b[38;5;28mstr\u001b[39m, mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mBaseRuntime],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3175\u001b[0m mock: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 3176\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m 
typing\u001b[38;5;241m.\u001b[39mUnion[DeployStatus, kfp\u001b[38;5;241m.\u001b[39mdsl\u001b[38;5;241m.\u001b[39mContainerOp]:\n\u001b[1;32m 3177\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"deploy real-time (nuclio based) functions\u001b[39;00m\n\u001b[1;32m 3178\u001b[0m \n\u001b[1;32m 3179\u001b[0m \u001b[38;5;124;03m :param function: name of the function (in the project) or function object\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3186\u001b[0m \u001b[38;5;124;03m :param mock: deploy mock server vs a real Nuclio function (for local simulations)\u001b[39;00m\n\u001b[1;32m 3187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3189\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3190\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3192\u001b[0m \u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3194\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3196\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mproject_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmock\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmock\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/operations.py:395\u001b[0m, in \u001b[0;36mdeploy_function\u001b[0;34m(function, dashboard, models, env, tag, verbose, builder_env, project_object, mock)\u001b[0m\n\u001b[1;32m 388\u001b[0m function\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 390\u001b[0m state\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 391\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname},\n\u001b[1;32m 392\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 393\u001b[0m )\n\u001b[0;32m--> 395\u001b[0m address \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# return object with the same outputs as the KFP op (allow using the same pipeline)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 400\u001b[0m state\u001b[38;5;241m=\u001b[39mfunction\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 401\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: address, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mnuclio_name},\n\u001b[1;32m 402\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 403\u001b[0m )\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:647\u001b[0m, in \u001b[0;36mServingRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_deploy_function_refs()\n\u001b[1;32m 645\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeploy root function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m ...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 652\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 654\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 655\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:586\u001b[0m, in \u001b[0;36mRemoteRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_credentials_from_remote_build(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 584\u001b[0m \u001b[38;5;66;03m# when a function is deployed, we wait for it to be ready by default\u001b[39;00m\n\u001b[1;32m 585\u001b[0m \u001b[38;5;66;03m# this also means that the function object will be updated with the function status\u001b[39;00m\n\u001b[0;32m--> 586\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_function_deployment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[38;5;66;03m# NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort\u001b[39;00m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# now, functions can be not exposed (using service type ClusterIP) and hence\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;66;03m# for BC we first try to populate the external invocation url, and then\u001b[39;00m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;66;03m# if not exists, take the internal invocation url\u001b[39;00m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mexternal_invocation_urls:\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:633\u001b[0m, in \u001b[0;36mRemoteRuntime._wait_for_function_deployment\u001b[0;34m(self, db, verbose)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m state \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 632\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNuclio function failed to deploy\u001b[39m\u001b[38;5;124m\"\u001b[39m, function_state\u001b[38;5;241m=\u001b[39mstate)\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RunError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFunction \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m deployment failed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mRunError\u001b[0m: Function serving deployment failed" + ] } ], "source": [ diff --git a/serving-Copy1.ipynb b/serving-Copy1.ipynb new file 
mode 100644 index 0000000..975ad64 --- /dev/null +++ b/serving-Copy1.ipynb @@ -0,0 +1,328 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1b3d7eb9-b601-47b4-a914-191e5bcf2764", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "41fa803e-cd2c-46ff-ba0c-6ff6d7b0b92c", + "metadata": {}, + "outputs": [], + "source": [ + "#import sys\n", + "#!{sys.executable} -m pip install --upgrade xgboost --quiet # upgrade xgboost to the latest version" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2c7bb858-9603-4c67-92c0-722b0cf24714", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-06 10:06:18,523 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"sagemaker-v2\", \n", + " user_project=True,\n", + " parameters={\n", + " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", + " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "74dab54c-6348-4a18-9db5-5d8074370fb0", + "metadata": {}, + "outputs": [], + "source": [ + "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3ee1d9bd-2652-4349-8df5-e231edb6acfa", + "metadata": {}, + "outputs": [], + "source": [ + "test_serving_function = project.set_function(\n", + " func=\"src/functions/serving.py\",\n", + " name=\"test-serving\",\n", + " kind=\"serving\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6a291b9c-0acc-4807-ab8e-4bec180a2bbf", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n",
+ "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "preprocess\n", + "\n", + "preprocess\n", + "\n", + "\n", + "\n", + "_start->preprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "xgboost-model\n", + "\n", + "xgboost-model\n", + "\n", + "\n", + "\n", + "preprocess->xgboost-model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "postprocess\n", + "\n", + "postprocess\n", + "\n", + "\n", + "\n", + "xgboost-model->postprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set the topology and get the graph object:\n", + "graph = test_serving_function.set_topology(\"flow\", engine=\"async\")\n", + "\n", + "# Add the steps:\n", + "graph.to(handler=\"preprocess\", name=\"preprocess\") \\\n", + " .to(\"XGBModelServer\",\n", + " name=\"xgboost-model\",\n", + " model_path=model_path) \\\n", + " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", + "\n", + "# Plot to graph:\n", + "test_serving_function.plot(rankdir='LR')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fb976023-5a2c-4dc8-b1b7-fd897446b747", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-06 10:06:19,590 [info] model xgboost-model was loaded\n", + "> 2024-02-06 10:06:19,926 [error] Pushing error to error stream: Expected key \"inputs\" in request body\n", + "Traceback (most recent call last):\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/flow.py\", line 212, in _do_and_recover\n", + " return await self._do(event)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/flow.py\", line 423, in _do\n", + " fn_result = await self._call(element)\n", + " File 
\"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/flow.py\", line 410, in _call\n", + " res = self._fn(element)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 242, in do_event\n", + " request = self._pre_event_processing_actions(event, event_body, op)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 225, in _pre_event_processing_actions\n", + " return self.validate(request, op)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 358, in validate\n", + " raise Exception('Expected key \"inputs\" in request body')\n", + "Exception: Expected key \"inputs\" in request body\n", + "\n" + ] + } + ], + "source": [ + "server = test_serving_function.to_mock_server()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "35e98782-129d-4ffb-b27e-d580589d6106", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "data = pd.read_csv(\n", + " \"financial_transactions_mini.csv\",\n", + " parse_dates=[\"timestamp\"],\n", + " infer_datetime_format=True,\n", + " dtype={\"transaction_category\": \"string\"},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cf821aae-83e4-4cc7-ba4a-b3038f7fd954", + "metadata": {}, + "outputs": [], + "source": [ + "data['transaction_id'] = data.reset_index().index" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d9b9df7b-3fce-4e2b-b739-2a845ae1df30", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'receiver_id': 4518551904499919,\n", + " 'sender_id': 4333582346477646,\n", + " 'amount': 833.26,\n", + " 'timestamp': Timestamp('2021-03-10 19:57:42'),\n", + " 'transaction_id': 0}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first_event_data = 
data.drop('transaction_category',axis=1)[:1].to_dict('records')\n", + "first_event_data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "dff40293-9d50-400c-9a1b-62a7e610e176", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-06 10:06:19,928 [error] run error, Traceback (most recent call last):\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py\", line 280, in run\n", + " response = self.graph.run(event, **(extra_args or {}))\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 1147, in run\n", + " return resp.await_result()\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/sources.py\", line 67, in await_result\n", + " raise copy.copy(result)\n", + "Exception: Expected key \"inputs\" in request body\n", + "\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "failed (400): Exception: Expected key \"inputs\" in request body", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mserver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfirst_event_data\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py:250\u001b[0m, in \u001b[0;36mGraphServer.test\u001b[0;34m(self, path, body, method, headers, content_type, silent, get_body, event_id, trigger, offset, time)\u001b[0m\n\u001b[1;32m 248\u001b[0m resp \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun(event, get_body\u001b[38;5;241m=\u001b[39mget_body)\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(resp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus_code\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m resp\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m300\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m silent:\n\u001b[0;32m--> 250\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfailed (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m): \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 251\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", + "\u001b[0;31mRuntimeError\u001b[0m: failed (400): Exception: Expected key \"inputs\" in request body" + ] + } + ], + "source": [ + "response = server.test(body=first_event_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef10a992-7fce-424f-8733-f1eb190f7c42", + "metadata": {}, + "outputs": [], + "source": [ + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "381d3223-4ff0-454f-b0b7-1ed9589faca5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smdemo", + "language": "python", + "name": "smdemo" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + 
"nbformat": 4, + "nbformat_minor": 5 +} diff --git a/serving.ipynb b/serving.ipynb new file mode 100644 index 0000000..2eb5e98 --- /dev/null +++ b/serving.ipynb @@ -0,0 +1,955 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1b3d7eb9-b601-47b4-a914-191e5bcf2764", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "41fa803e-cd2c-46ff-ba0c-6ff6d7b0b92c", + "metadata": {}, + "outputs": [], + "source": [ + "#import sys\n", + "#!{sys.executable} -m pip install --upgrade xgboost --quiet # upgrade xgboost to the latest version" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2c7bb858-9603-4c67-92c0-722b0cf24714", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-06 08:34:06,016 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"sagemaker-v2\", \n", + " user_project=True,\n", + " parameters={\n", + " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", + " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "74dab54c-6348-4a18-9db5-5d8074370fb0", + "metadata": {}, + "outputs": [], + "source": [ + "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3ee1d9bd-2652-4349-8df5-e231edb6acfa", + "metadata": {}, + "outputs": [], + "source": [ + "test_serving_function = project.set_function(\n", + " func=\"src/functions/serving.py\",\n", + " name=\"test-serving\",\n", + " kind=\"serving\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6a291b9c-0acc-4807-ab8e-4bec180a2bbf", +
"metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "preprocess\n", + "\n", + "preprocess\n", + "\n", + "\n", + "\n", + "_start->preprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set the topology and get the graph object:\n", + "graph = test_serving_function.set_topology(\"flow\", engine=\"async\")\n", + "\n", + "# Add the steps:\n", + "graph.to(handler=\"preprocess\", name=\"preprocess\").respond()\n", + "\n", + " # .to(\"XGBModelServer\",\n", + " # name=\"xgboost-model\",\n", + " # model_path=model_path) \\\n", + " # .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", + "\n", + "# Plot to graph:\n", + "test_serving_function.plot(rankdir='LR')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fb976023-5a2c-4dc8-b1b7-fd897446b747", + "metadata": {}, + "outputs": [], + "source": [ + "server = test_serving_function.to_mock_server()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "35e98782-129d-4ffb-b27e-d580589d6106", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "data = pd.read_csv(\n", + " \"financial_transactions_mini.csv\",\n", + " parse_dates=[\"timestamp\"],\n", + " infer_datetime_format=True,\n", + " dtype={\"transaction_category\": \"string\"},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "90ae1e7b-8e46-418b-b057-76071a22a8c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_categoryreceiver_idsender_idamounttimestamp
0Uncategorized45185519044999194333582346477646833.262021-03-10 19:57:42
1Uncategorized45185519044999194642413144038776596.632021-02-11 17:53:32
2Uncategorized42745440229395224952665515556751176.762021-02-21 18:29:32
3Uncategorized45185519044999194457298962882528879.782021-04-09 16:14:19
4Uncategorized46018532461252204578126462896710742.252021-04-04 15:50:16
..................
99992Pension and insurances44050083552203244583355906735225205.432021-04-20 12:23:53
99993Pension and insurances43004167445113354949240916846171151.492021-03-24 19:30:18
99994Pension and insurances44050083552203244996896020767264188.282021-03-08 19:51:10
99995Pension and insurances42620471944990064017367486513464204.262021-02-14 23:25:07
99996Pension and insurances46275166741447044250420705087194207.922021-04-14 00:42:00
\n", + "

99997 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category receiver_id sender_id amount \\\n", + "0 Uncategorized 4518551904499919 4333582346477646 833.26 \n", + "1 Uncategorized 4518551904499919 4642413144038776 596.63 \n", + "2 Uncategorized 4274544022939522 4952665515556751 176.76 \n", + "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", + "4 Uncategorized 4601853246125220 4578126462896710 742.25 \n", + "... ... ... ... ... \n", + "99992 Pension and insurances 4405008355220324 4583355906735225 205.43 \n", + "99993 Pension and insurances 4300416744511335 4949240916846171 151.49 \n", + "99994 Pension and insurances 4405008355220324 4996896020767264 188.28 \n", + "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", + "99996 Pension and insurances 4627516674144704 4250420705087194 207.92 \n", + "\n", + " timestamp \n", + "0 2021-03-10 19:57:42 \n", + "1 2021-02-11 17:53:32 \n", + "2 2021-02-21 18:29:32 \n", + "3 2021-04-09 16:14:19 \n", + "4 2021-04-04 15:50:16 \n", + "... ... 
\n", + "99992 2021-04-20 12:23:53 \n", + "99993 2021-03-24 19:30:18 \n", + "99994 2021-03-08 19:51:10 \n", + "99995 2021-02-14 23:25:07 \n", + "99996 2021-04-14 00:42:00 \n", + "\n", + "[99997 rows x 5 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "cf821aae-83e4-4cc7-ba4a-b3038f7fd954", + "metadata": {}, + "outputs": [], + "source": [ + "data['transaction_id'] = data.reset_index().index" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "057e2627-588b-464e-aa67-5f9daf209d5c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "test_data = pd.read_csv(\n", + " \"test.csv\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "fda9453e-c423-4c39-9c93-1ce93e42c38a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'receiver_id': 4518551904499919,\n", + " 'sender_id': 4333582346477646,\n", + " 'amount': 833.26,\n", + " 'timestamp': Timestamp('2021-03-10 19:57:42'),\n", + " 'transaction_id': 0}]" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first_event_data = data.drop('transaction_category',axis=1)[:1].to_dict('records')\n", + "first_event_data" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "0ba6eef2-b37c-4db0-adc1-a93b9b9246ab", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun.feature_store as fstore\n", + "def get_realtime_transactions_aggregations():\n", + " # Create a feature vector that gets the average amount\n", + " vector = fstore.FeatureVector(\"aggregations-vector\", [\"aggregations.amount_avg_1d\"], with_indexes=True)\n", + " #get the categories list\n", + " unique_categories = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"10\",\"11\",\"12\",\"13\",\"14\",\"15\",\"16\"]\n", + " # Use online 
feature service to get the latest average amount per category\n", + " with vector.get_online_feature_service() as online_feature_service:\n", + " resp = online_feature_service.get(\n", + " [{\"transaction_category\":cat} for cat in unique_categories]\n", + " )\n", + " return resp\n" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "84877799-a628-46db-baaf-970dfbd05a67", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'amount_avg_1d': 606.79, 'transaction_category': '0'},\n", + " {'amount_avg_1d': 17.0, 'transaction_category': '1'},\n", + " {'amount_avg_1d': 883.925, 'transaction_category': '2'},\n", + " {'amount_avg_1d': 128.59, 'transaction_category': '3'},\n", + " {'amount_avg_1d': 36.695, 'transaction_category': '4'},\n", + " {'amount_avg_1d': 115.35, 'transaction_category': '5'},\n", + " {'amount_avg_1d': 190.425, 'transaction_category': '6'},\n", + " {'amount_avg_1d': 35.76, 'transaction_category': '7'},\n", + " {'amount_avg_1d': 5400.805, 'transaction_category': '8'},\n", + " {'amount_avg_1d': 163.965, 'transaction_category': '9'},\n", + " {'amount_avg_1d': 123.33, 'transaction_category': '10'},\n", + " {'amount_avg_1d': 265.07500000000005, 'transaction_category': '11'},\n", + " {'amount_avg_1d': 14.575, 'transaction_category': '12'},\n", + " {'amount_avg_1d': 119.17, 'transaction_category': '13'},\n", + " {'amount_avg_1d': 674.905, 'transaction_category': '14'},\n", + " {'amount_avg_1d': 4964.49, 'transaction_category': '15'},\n", + " {'amount_avg_1d': 166.99, 'transaction_category': '16'}]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "resp = get_realtime_transactions_aggregations()\n", + "resp" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "627561cb-ed85-4e8e-8f1a-10c312724886", + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_distances(resp, event):\n", + " for cat in resp:\n", + " transaction_category = 
cat['transaction_category'] \n", + " amount_avg = cat['amount_avg_1d']\n", + " event[0][\"dist_\" + transaction_category] = abs(amount_avg - event[0][\"amount\"])\n", + "\n", + " return event" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "7f0ca55e-6f74-4b4a-9abe-9cd95a8507ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'receiver_id': 4518551904499919,\n", + " 'sender_id': 4333582346477646,\n", + " 'amount': 833.26,\n", + " 'timestamp': Timestamp('2021-03-10 19:57:42'),\n", + " 'transaction_id': 0,\n", + " 'dist_0': 226.47000000000003,\n", + " 'dist_1': 816.26,\n", + " 'dist_2': 50.664999999999964,\n", + " 'dist_3': 704.67,\n", + " 'dist_4': 796.5649999999999,\n", + " 'dist_5': 717.91,\n", + " 'dist_6': 642.835,\n", + " 'dist_7': 797.5,\n", + " 'dist_8': 4567.545,\n", + " 'dist_9': 669.295,\n", + " 'dist_10': 709.93,\n", + " 'dist_11': 568.185,\n", + " 'dist_12': 818.685,\n", + " 'dist_13': 714.09,\n", + " 'dist_14': 158.35500000000002,\n", + " 'dist_15': 4131.23,\n", + " 'dist_16': 666.27}]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dist_event = calculate_distances(resp,first_event_data)\n", + "dist_event" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "350e295d-1723-4548-9704-b9f4003e544f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2021" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dist_event[0]['timestamp'].year" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "0ce33455-ec2a-4e34-a2eb-86f154eece84", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_timestamp_to_components(event):\n", + " event[0][\"year\"] = event[0][\"timestamp\"].year\n", + " event[0][\"month\"] = event[0][\"timestamp\"].month\n", + " event[0][\"day\"] = event[0][\"timestamp\"].day\n", + " event[0][\"hour\"] = 
event[0][\"timestamp\"].hour\n", + " event[0][\"minute\"] = event[0][\"timestamp\"].minute\n", + " event[0][\"second\"] = event[0][\"timestamp\"].second\n", + " del event[0]['timestamp']\n", + "\n", + " return event\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "0471c226-00cf-40f0-9460-11ce4aaded9f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'receiver_id': 4518551904499919,\n", + " 'sender_id': 4333582346477646,\n", + " 'amount': 833.26,\n", + " 'transaction_id': 0,\n", + " 'dist_0': 226.47000000000003,\n", + " 'dist_1': 816.26,\n", + " 'dist_2': 50.664999999999964,\n", + " 'dist_3': 704.67,\n", + " 'dist_4': 796.5649999999999,\n", + " 'dist_5': 717.91,\n", + " 'dist_6': 642.835,\n", + " 'dist_7': 797.5,\n", + " 'dist_8': 4567.545,\n", + " 'dist_9': 669.295,\n", + " 'dist_10': 709.93,\n", + " 'dist_11': 568.185,\n", + " 'dist_12': 818.685,\n", + " 'dist_13': 714.09,\n", + " 'dist_14': 158.35500000000002,\n", + " 'dist_15': 4131.23,\n", + " 'dist_16': 666.27,\n", + " 'year': 2021,\n", + " 'month': 3,\n", + " 'day': 10,\n", + " 'hour': 19,\n", + " 'minute': 57,\n", + " 'second': 42}]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extended_event = convert_timestamp_to_components(dist_event)\n", + "extended_event" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "22d5fc7a-f41c-4d41-b8b7-ebd084599f67", + "metadata": {}, + "outputs": [], + "source": [ + "def move_to_end(ls, key):\n", + " \"\"\"Move an item to the end of the dictionary.\"\"\"\n", + " d = ls[0]\n", + " if key in d:\n", + " value = d.pop(key) # Remove the item and get its value\n", + " d[key] = value # Reinsert the item, which moves it to the end\n", + " ls[0] = d\n", + " return ls" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "e1bba3af-ef47-45f7-987d-c6f57c819922", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"[{'receiver_id': 4518551904499919,\n", + " 'sender_id': 4333582346477646,\n", + " 'amount': 833.26,\n", + " 'dist_0': 226.47000000000003,\n", + " 'dist_1': 816.26,\n", + " 'dist_2': 50.664999999999964,\n", + " 'dist_3': 704.67,\n", + " 'dist_4': 796.5649999999999,\n", + " 'dist_5': 717.91,\n", + " 'dist_6': 642.835,\n", + " 'dist_7': 797.5,\n", + " 'dist_8': 4567.545,\n", + " 'dist_9': 669.295,\n", + " 'dist_10': 709.93,\n", + " 'dist_11': 568.185,\n", + " 'dist_12': 818.685,\n", + " 'dist_13': 714.09,\n", + " 'dist_14': 158.35500000000002,\n", + " 'dist_15': 4131.23,\n", + " 'dist_16': 666.27,\n", + " 'year': 2021,\n", + " 'month': 3,\n", + " 'day': 10,\n", + " 'hour': 19,\n", + " 'minute': 57,\n", + " 'second': 42,\n", + " 'transaction_id': 0}]" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "restructured_event = move_to_end(extended_event,'transaction_id')\n", + "restructured_event" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "05e84a7d-8c43-4c0f-8f08-8b10ac3ce624", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[4518551904499919,\n", + " 4333582346477646,\n", + " 833.26,\n", + " 226.47000000000003,\n", + " 816.26,\n", + " 50.664999999999964,\n", + " 704.67,\n", + " 796.5649999999999,\n", + " 717.91,\n", + " 642.835,\n", + " 797.5,\n", + " 4567.545,\n", + " 669.295,\n", + " 709.93,\n", + " 568.185,\n", + " 818.685,\n", + " 714.09,\n", + " 158.35500000000002,\n", + " 4131.23,\n", + " 666.27,\n", + " 2021,\n", + " 3,\n", + " 10,\n", + " 19,\n", + " 57,\n", + " 42,\n", + " 0]" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "values_list = list(restructured_event[0].values())\n", + "values_list" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "f32ae384-7be1-4842-9754-695fb0f3fb32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[4518551904499919,\n", 
+ " 4333582346477646,\n", + " 833.26,\n", + " 226.47000000000003,\n", + " 816.26,\n", + " 50.664999999999964,\n", + " 704.67,\n", + " 796.5649999999999,\n", + " 717.91,\n", + " 642.835,\n", + " 797.5,\n", + " 4567.545,\n", + " 669.295,\n", + " 709.93,\n", + " 568.185,\n", + " 818.685,\n", + " 714.09,\n", + " 158.35500000000002,\n", + " 4131.23,\n", + " 666.27,\n", + " 2021,\n", + " 3,\n", + " 10,\n", + " 19,\n", + " 57,\n", + " 42,\n", + " 0]]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "return_list = [values_list]\n", + "return_list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e4367c70-00d8-4a1e-a042-9bba154a7f17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[4572835609402945.0,\n", + " 4036699444587678.5,\n", + " 1936.83,\n", + " 1417.285,\n", + " 1883.725,\n", + " 1838.2753142857143,\n", + " 1517.5099999999998,\n", + " 1917.2477777777776,\n", + " 1688.4639999999995,\n", + " 1418.1149999999975,\n", + " 1324.3757142857137,\n", + " 1506.1275,\n", + " 1081.8533333333314,\n", + " 1725.5830000000003,\n", + " 1936.83,\n", + " 1834.4686689419796,\n", + " 1900.4391666666663,\n", + " 1805.5211627906972,\n", + " 1844.352278481013,\n", + " 1884.5990322580644,\n", + " 5144.063333333338,\n", + " 1844.468,\n", + " 2021.0,\n", + " 2.0,\n", + " 24.0,\n", + " 2.0,\n", + " 13.0,\n", + " 10.0,\n", + " 44990.0]]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = test_data.drop('transaction_category',axis=1)[:1].values.tolist()\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f594383-8769-40c4-834d-4452ab5f58d0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "response = server.test(body=samp_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5ba986f-cc05-47a0-ae85-5461258e86d6", + "metadata": 
{}, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "# response = server.test(body=samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85c8f63c-98b9-4977-a699-dc189aba8a51", + "metadata": {}, + "outputs": [], + "source": [ + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d00c6d6-869d-45b0-a6f6-e516944299fb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d87d1c46-a0fe-4d65-b069-d641088158bf", + "metadata": {}, + "outputs": [], + "source": [ + "unique_categories = [ 'Uncategorized', 'Entertainment', 'Education',\n", + " 'Shopping', 'Personal Care', 'Health and Fitness',\n", + " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", + " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", + " 'Fees and Charges', 'Business Services', 'Personal Services',\n", + " 'Taxes', 'Gambling', 'Home',\n", + " 'Pension and insurances']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "989e0376-ac28-4859-af7d-986bcf9f4b1d", + "metadata": {}, + "outputs": [], + "source": [ + "len('unique_categories')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95bb69b4-319c-4561-937d-5ef2bf100b8d", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the list of features we will be using\n", + "features = ['aggregations.*']\n", + "\n", + "# Import MLRun's Feature Store\n", + "import mlrun.feature_store as fstore\n", + "\n", + "# Define the feature vector name for future reference\n", + "fv_name = 'aggregations-vector'\n", + "\n", + "# Define the feature vector using our Feature Store (fstore)\n", + "aggregations_fv = fstore.FeatureVector(fv_name, \n", + " features, \n", + " description='aggregation information')\n", + "\n", + "# Save the feature vector in the Feature Store\n", + "aggregations_fv.save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "88a45f5e-942c-467e-ba83-722f214ddead", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun.feature_store as fstore\n", + "\n", + "# Create a feature vector that gets the average amount\n", + "vector = fstore.FeatureVector(\"transactions_vector\", [\"aggregations.amount_avg_1d\"], with_indexes=True)\n", + "unique_categories = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"10\",\"11\",\"12\",\"13\",\"14\",\"15\",\"16\"]\n", + "# Use online feature service to get the latest average amount per category\n", + "with vector.get_online_feature_service() as online_feature_service:\n", + " resp = online_feature_service.get(\n", + " [{\"transaction_category\":cat} for cat in unique_categories]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "351feb39-9b3b-4dbe-9662-93c690a563fa", + "metadata": {}, + "outputs": [], + "source": [ + "print(resp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ca44f54-4f9b-45e5-a18b-43697c7ea6cd", + "metadata": {}, + "outputs": [], + "source": [ + "for cat in resp:\n", + " transaction_category = cat['transaction_category']\n", + " amount_avg = cat['amount_avg_1d']\n", + " data[\"dist_\" + transaction_category] = abs(amount_avg - data[\"amount\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smdemo", + "language": "python", + "name": "smdemo" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/functions/serving.py b/src/functions/serving.py index c600f36..a8735c2 100644 --- a/src/functions/serving.py +++ b/src/functions/serving.py @@ -75,39 +75,57 @@ def postprocess(inputs: dict) -> dict: inputs["confidences"] = confidences return inputs -# Function that 
preprocesses the inference data -def preprocess(event): - print("--------------------") - print(event) - +def get_realtime_transactions_aggregations(): # Create a feature vector that gets the average amount - vector = fstore.FeatureVector("transactions_vector", ["aggregations.amount_avg_1d"], with_indexes=True) + vector = fstore.FeatureVector("aggregations-vector", ["aggregations.amount_avg_1d"], with_indexes=True) + #get the categories list unique_categories = ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16"] # Use online feature service to get the latest average amount per category with vector.get_online_feature_service() as online_feature_service: resp = online_feature_service.get( [{"transaction_category":cat} for cat in unique_categories] ) + return resp - print('---------') - print(resp) - +def calculate_distances(resp, event): for cat in resp: transaction_category = cat['transaction_category'] amount_avg = cat['amount_avg_1d'] event[0]["dist_" + transaction_category] = abs(amount_avg - event[0]["amount"]) - print(event) - # # convert timestamp to components - # event["year"] = event["timestamp"].dt.year - # event["month"] = event["timestamp"].dt.month - # event["day"] = event["timestamp"].dt.day - # event["hour"] = event["timestamp"].dt.hour - # event["minute"] = event["timestamp"].dt.minute - # event["second"] = event["timestamp"].dt.second - - # del data["timestamp"] - # del data["transaction_category"] - event_list = list(list(event[0].values())) - - return event_list \ No newline at end of file + return event + +def convert_timestamp_to_components(event): + event[0]["year"] = event[0]["timestamp"].year + event[0]["month"] = event[0]["timestamp"].month + event[0]["day"] = event[0]["timestamp"].day + event[0]["hour"] = event[0]["timestamp"].hour + event[0]["minute"] = event[0]["timestamp"].minute + event[0]["second"] = event[0]["timestamp"].second + del event[0]['timestamp'] + + return event + +def move_to_end(ls, key): + """Move 
an item to the end of the dictionary.""" + d = ls[0] + if key in d: + value = d.pop(key) # Remove the item and get its value + d[key] = value # Reinsert the item, which moves it to the end + ls[0] = d + return ls + + + + +# Function that preprocesses the inference data +def preprocess(event): + resp = get_realtime_transactions_aggregations() + dist_event = calculate_distances(resp, event) + converted_event = convert_timestamp_to_components(event) + restructured_event = move_to_end(converted_event,'transaction_id') + values_list = list(restructured_event[0].values()) + return_list = [values_list] + return_list + return return_list + \ No newline at end of file From c07e1ddd20eaf24d3e8dc6229b6c7bfd7baa3e3f Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Wed, 7 Feb 2024 13:18:53 +0000 Subject: [PATCH 04/16] update usage of feature store --- financial_payment_classification_v3.ipynb | 1364 +++++++++++++++++++++ 1 file changed, 1364 insertions(+) create mode 100644 financial_payment_classification_v3.ipynb diff --git a/financial_payment_classification_v3.ipynb b/financial_payment_classification_v3.ipynb new file mode 100644 index 0000000..555c0b4 --- /dev/null +++ b/financial_payment_classification_v3.ipynb @@ -0,0 +1,1364 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01b5c703", + "metadata": {}, + "source": [ + "# SageMaker Payment Classification \n" + ] + }, + { + "cell_type": "markdown", + "id": "6498f087", + "metadata": {}, + "source": [ + "---\n", + "\n", + "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", + "\n", + "![This us-west-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "c2e49281", + "metadata": {}, + "source": [ + "\n", + "## Background \n", + "\n", + "This notebook demonstrates how you can train and deploy a machine learning model to classify payment transactions. The model enriches financial transactions with the category of the transaction. This can be used as an intermediate step in fraud detection, personalization or anomaly detection, as well as a method to provide end users (e.g. customers at a bank) with an overview of their spending habits. Amazon SageMaker can be used to train and deploy an XGBoost model, as well as the required underlying infrastructure. For this notebook a generated dataset is used where a payment consists mostly of an amount, sender, receiver and timestamp.\n", + "\n", + "\n", + "## Notebook overview \n", + "\n", + "This notebook consists of seven parts. First, we import and configure the required libraries. After that we prepare the data used in this example and create the feature store. With the newly created features we create an XGBoost model. An endpoint is created to host this model. We evaluate the performance of the model and end by cleaning up the used resources.\n", + "\n", + "## Dataset \n", + "\n", + "For this notebook we use a synthetic dataset.
This dataset has the following features \n", + "\n", + "* __transaction_category__: The category of the transaction, which is one of the following 19 options.\n", + "\n", + " 'Uncategorized', 'Entertainment', 'Education',\n", + " 'Shopping', 'Personal Care', 'Health and Fitness',\n", + " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", + " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", + " 'Fees and Charges', 'Business Services', 'Personal Services',\n", + " 'Taxes', 'Gambling', 'Home',\n", + " 'Pension and insurances'\n", + "\n", + "\n", + "* __receiver_id__: an identifier for the receiving party. The identifier consists of 16 digits.\n", + "* __sender_id__: an identifier for the sending party. The identifier consists of 16 digits.\n", + "* __amount__: the amount which is transferred.\n", + "* __timestamp__: the timestamp of the transaction in YYYY-MM-DD HH:MM:SS format.\n", + "\n", + "\n", + "### 1. Setup \n", + "\n", + "Before we start we need to update the sagemaker library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fff19d6b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# import sys\n", + "# !{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest version\n", + "# !{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest version\n", + "# !{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest version" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text":
[ + "> 2024-02-07 12:50:31,736 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"sagemaker-v3\", \n", + " user_project=True,\n", + " parameters={\n", + " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", + " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1b17a94d", + "metadata": {}, + "source": [ + "Now that we have the latest version we can import the libraries that we'll use in this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "42c5d6d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml\n" + ] + } + ], + "source": [ + "import boto3\n", + "import io\n", + "import sagemaker\n", + "import time\n", + "import os\n", + "from sklearn.metrics import classification_report\n", + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import datetime, timedelta" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", + "metadata": {}, + "outputs": [], + "source": [ + "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", + "metadata": {}, + "outputs": [], + "source": [ + "sess = sagemaker.Session()\n", + "write_bucket = sess.default_bucket()\n", + "write_prefix = \"sagemaker-app-lab\"" + ] + }, + { + "cell_type": "markdown", + "id": "3af7c33d", + "metadata": {}, + "source": [ + "Let's set the session variables to ensure that SageMaker is configured correctly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c0e4db17", + "metadata": {}, + "outputs": [], + "source": [ + "region = sagemaker.Session().boto_region_name\n", + "sm_client = boto3.client(\"sagemaker\")\n", + "boto_session = boto3.Session(region_name=region)\n", + "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", + "role = sagemaker_role\n", + "bucket_prefix = \"payment-classification\"\n", + "s3_bucket = sagemaker_session.default_bucket()" + ] + }, + { + "cell_type": "markdown", + "id": "4fe6a975", + "metadata": {}, + "source": [ + "We define the factorize key which is used to map the '__transaction_category__' to numeric values" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "43946b9f", + "metadata": {}, + "outputs": [], + "source": [ + "factorize_key = {\n", + " \"Uncategorized\": 0,\n", + " \"Entertainment\": 1,\n", + " \"Education\": 2,\n", + " \"Shopping\": 3,\n", + " \"Personal Care\": 4,\n", + " \"Health and Fitness\": 5,\n", + " \"Food and Dining\": 6,\n", + " \"Gifts and Donations\": 7,\n", + " \"Investments\": 8,\n", + " \"Bills and Utilities\": 9,\n", + " \"Auto and Transport\": 10,\n", + " \"Travel\": 11,\n", + " \"Fees and Charges\": 12,\n", + " \"Business Services\": 13,\n", + " \"Personal Services\": 14,\n", + " \"Taxes\": 15,\n", + " \"Gambling\": 16,\n", + " \"Home\": 17,\n", + " \"Pension and insurances\": 18,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5e3dc3c4", + "metadata": {}, + "source": [ + "### 2. 
Data preparation \n", + "\n", + "We ingest the simulated data from the public SageMaker S3 training database:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5ff0d280", + "metadata": {}, + "outputs": [], + "source": [ + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " f\"sagemaker-example-files-prod-{region}\",\n", + " \"datasets/tabular/synthetic_financial/financial_transactions_mini.csv\",\n", + " \"financial_transactions_mini.csv\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "08578d93", + "metadata": {}, + "source": [ + "Let's start by loading the dataset from our csv file into a Pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a477abd7", + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\n", + " \"financial_transactions_mini.csv\",\n", + " parse_dates=[\"timestamp\"],\n", + " infer_datetime_format=True,\n", + " dtype={\"transaction_category\": \"string\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cf6be447", + "metadata": {}, + "source": [ + "The dataframe looks as follows:\n", + "\n", + "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", + "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", + "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", + "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", + "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", + "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", + "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", + "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", + "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 
13:23:44 |\n", + "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", + "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", + "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" + ] + }, + { + "cell_type": "markdown", + "id": "b5492919", + "metadata": {}, + "source": [ + "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", + "metadata": {}, + "outputs": [], + "source": [ + "for key, val in factorize_key.items():\n", + " factorize_key[key] = str(val)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ea2ebdd5", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", + "metadata": {}, + "outputs": [], + "source": [ + "data['transaction_id']= data.reset_index().index " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using 
.loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + 
"\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_categoryreceiver_idsender_idamounttimestamptransaction_id
10604.601853e+154.274416e+15879.392021-01-01 15:07:52106.0
37804.274544e+154.366884e+15628.012021-01-01 16:33:53378.0
36804.601853e+154.161674e+1589.692021-01-01 18:17:29368.0
1704.518552e+154.619387e+15222.012021-01-01 18:33:1817.0
17804.274544e+154.456440e+15418.522021-01-01 19:33:31178.0
.....................
6993894.904096e+154.133603e+15124.082024-02-05 15:00:0069938.0
7059294.904096e+154.444087e+15188.662024-02-06 10:00:0070592.0
7037994.200241e+154.202495e+15139.272024-02-06 15:00:0070379.0
7046294.612985e+154.525455e+1512.492024-02-07 10:00:0070462.0
7167294.538817e+154.291294e+1557.032024-02-07 15:00:0071672.0
\n", + "

99997 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category receiver_id sender_id amount \\\n", + "106 0 4.601853e+15 4.274416e+15 879.39 \n", + "378 0 4.274544e+15 4.366884e+15 628.01 \n", + "368 0 4.601853e+15 4.161674e+15 89.69 \n", + "17 0 4.518552e+15 4.619387e+15 222.01 \n", + "178 0 4.274544e+15 4.456440e+15 418.52 \n", + "... ... ... ... ... \n", + "69938 9 4.904096e+15 4.133603e+15 124.08 \n", + "70592 9 4.904096e+15 4.444087e+15 188.66 \n", + "70379 9 4.200241e+15 4.202495e+15 139.27 \n", + "70462 9 4.612985e+15 4.525455e+15 12.49 \n", + "71672 9 4.538817e+15 4.291294e+15 57.03 \n", + "\n", + " timestamp transaction_id \n", + "106 2021-01-01 15:07:52 106.0 \n", + "378 2021-01-01 16:33:53 378.0 \n", + "368 2021-01-01 18:17:29 368.0 \n", + "17 2021-01-01 18:33:18 17.0 \n", + "178 2021-01-01 19:33:31 178.0 \n", + "... ... ... \n", + "69938 2024-02-05 15:00:00 69938.0 \n", + "70592 2024-02-06 10:00:00 70592.0 \n", + "70379 2024-02-06 15:00:00 70379.0 \n", + "70462 2024-02-07 10:00:00 70462.0 \n", + "71672 2024-02-07 15:00:00 71672.0 \n", + "\n", + "[99997 rows x 6 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day)\n", + "from utils import update_timestamps\n", + "data = update_timestamps(data)\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", + "metadata": { + "tags": [] + }, + "source": [ + "### 3. Create feature store \n", + "\n", + "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 
" + ] + }, + { + "cell_type": "markdown", + "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", + "metadata": {}, + "source": [ + "#### feature-group-payment-classification" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_category_distance(event):\n", + " event['distance'] = abs(event['amount']-event['amount_avg_1d'])\n", + " return event" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4101c303-2da3-431b-9375-9fa1747070af", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "DateExtractor\n", + "\n", + "DateExtractor\n", + "\n", + "\n", + "\n", + "_start->DateExtractor\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Aggregates\n", + "\n", + "Aggregates\n", + "\n", + "\n", + "\n", + "DateExtractor->Aggregates\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "calculate_category_distance\n", + "\n", + "calculate_category_distance\n", + "\n", + "\n", + "\n", + "Aggregates->calculate_category_distance\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "DropFeatures\n", + "\n", + "DropFeatures\n", + "\n", + "\n", + "\n", + "calculate_category_distance->DropFeatures\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "parquet/parquet\n", + "\n", + "\n", + "parquet\n", + "\n", + "\n", + "\n", + "DropFeatures->parquet/parquet\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "nosql/nosql\n", + "\n", + "\n", + "nosql\n", + "\n", + "\n", + "\n", + "DropFeatures->nosql/nosql\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fstore\n", + "from mlrun.feature_store.steps import OneHotEncoder, MapValues, DateExtractor, 
DropFeatures\n", + "\n", + "# creating feature set\n", + "extended_transactions_set = fstore.FeatureSet(\"transactions-v3\",\n", + " entities=[fstore.Entity(\"transaction_id\")],\n", + " description=\"transactions feature set\")\n", + "\n", + "# setting up the graph\n", + "# setting up the graph\n", + "extended_transactions_set.graph \\\n", + " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp'))\n", + " \n", + "\n", + "\n", + "extended_transactions_set.add_aggregation(name='amount',\n", + " column='amount',\n", + " operations=['avg'],\n", + " windows=['1d'],\n", + " )\n", + "\n", + "extended_transactions_set.graph.to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates').to(DropFeatures(features=['timestamp']))\n", + "\n", + "\n", + "extended_transactions_set.set_targets()\n", + "\n", + "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-07 13:11:28,253 [warning] Overriding type of entity 'transaction_id' from 'str' to 'float'. 
This may result in errors or unusable data.\n" + ] + } + ], + "source": [ + "ingested_data = extended_transactions_set.ingest(data, overwrite=True)\n", + "ingested_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03ed9e67-34f1-44c6-9474-bb24e7561da6", + "metadata": {}, + "outputs": [], + "source": [ + "# Import MLRun's Feature Store\n", + "import mlrun.feature_store as fstore\n", + "\n", + "# create feature vector on top of aggreagations\n", + "# Define the list of features we will be using\n", + "features = ['transactions-v2.*']\n", + "\n", + "# Define the feature vector name for future reference\n", + "fv_name = 'aggreagations-vector'\n", + "\n", + "# Define the feature vector using our Feature Store (fstore)\n", + "aggregations_fv = fstore.FeatureVector(fv_name, \n", + " features, \n", + " description='stocks information')\n", + "\n", + "#label_feature = 'transactions-v2.transaction_category',\n", + "# Save the feature vector in the Feature Store\n", + "aggregations_fv.save()" + ] + }, + { + "cell_type": "markdown", + "id": "b5e4834e", + "metadata": {}, + "source": [ + "We update the values in the feature store with the real values of our data" + ] + }, + { + "cell_type": "markdown", + "id": "e2f6395f", + "metadata": {}, + "source": [ + "And display them after getting them from the feature store" + ] + }, + { + "cell_type": "markdown", + "id": "cf148985", + "metadata": {}, + "source": [ + "We use the feature store to calculate the distance between the average of every category and the current amount" + ] + }, + { + "cell_type": "markdown", + "id": "289eeca6", + "metadata": {}, + "source": [ + "### 4. Create model \n", + "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. 
The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", + "\n", + "\n", + "\n", + "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb4bdd8d", + "metadata": {}, + "outputs": [], + "source": [ + "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", + "train_data, validation_data, test_data = np.split(\n", + " data.sample(frac=1, random_state=42), [int(0.7 * len(data)), int(0.9 * len(data))]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f81f65b9", + "metadata": {}, + "source": [ + "We save these sets to a file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f849a7a9", + "metadata": {}, + "outputs": [], + "source": [ + "train_data.to_csv(\"train.csv\", index=False, header=False)\n", + "validation_data.to_csv(\"validation.csv\", index=False, header=False)\n", + "test_data.to_csv(\"test.csv\", index=False, header=True)" + ] + }, + { + "cell_type": "markdown", + "id": "de669936", + "metadata": {}, + "source": [ + "And upload these files to our s3 bucket" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1ca2543", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"train/train.csv\")\n", + ").upload_file(\"train.csv\")\n", + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"validation/validation.csv\")\n", + ").upload_file(\"validation.csv\")\n", + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"test/test.csv\")\n", + 
").upload_file(\"test.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "22de532f", + "metadata": {}, + "source": [ + "Get the XGBoost sagemaker image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a41b6a7d", + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" + ] + }, + { + "cell_type": "markdown", + "id": "66cae2a9", + "metadata": {}, + "source": [ + "Transform our data to a sagemaker input for training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e51c917a", + "metadata": {}, + "outputs": [], + "source": [ + "s3_input_train = sagemaker.inputs.TrainingInput(\n", + " s3_data=\"s3://{}/{}/train\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", + ")\n", + "s3_input_validation = sagemaker.inputs.TrainingInput(\n", + " s3_data=\"s3://{}/{}/validation/\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6f2985d8", + "metadata": {}, + "source": [ + "We define the XGBoost model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92c1fe8c", + "metadata": {}, + "outputs": [], + "source": [ + "xgb = sagemaker.estimator.Estimator(\n", + " container,\n", + " role,\n", + " instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + " output_path=\"s3://{}/{}/output\".format(s3_bucket, bucket_prefix),\n", + " sagemaker_session=sagemaker_session,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ecafdfe8", + "metadata": {}, + "source": [ + "Set the parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "582adc6c", + "metadata": {}, + "outputs": [], + "source": [ + "xgb.set_hyperparameters(\n", + " max_depth=5,\n", + " eta=0.2,\n", + " gamma=4,\n", + " min_child_weight=6,\n", + " subsample=0.8,\n", + " objective=\"multi:softprob\",\n", + " num_class=19,\n", + " verbosity=0,\n", + " 
num_round=100,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b36463dd", + "metadata": {}, + "source": [ + "And train the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c24e06fc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" + ] + }, + { + "cell_type": "markdown", + "id": "8b716cd7", + "metadata": {}, + "source": [ + "### 5. Using the endpoint \n", + "\n", + "Deploy the model to an endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", + "metadata": {}, + "outputs": [], + "source": [ + "xgb.model_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78444d49-4ad3-49e4-a579-19b173facb26", + "metadata": {}, + "outputs": [], + "source": [ + "serving_function = project.get_function(\"serving\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", + "metadata": {}, + "outputs": [], + "source": [ + "# Set the topology and get the graph object:\n", + "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", + "\n", + "# Add the steps:\n", + "graph.to(handler=\"preprocess\", name=\"preprocess\") \\\n", + " .to(\"XGBModelServer\",\n", + " name=\"xgboost-model\",\n", + " model_path=xgb.model_data) \\\n", + " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", + "\n", + "# Plot to graph:\n", + "serving_function.plot(rankdir='LR')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", + "metadata": {}, + "outputs": [], + "source": [ + "project.deploy_function(\"serving\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c858e3e9-9e43-4148-8015-6047565db456", + "metadata": {}, + "outputs": [], + "source": [ + "samples = 
test_data.drop('transaction_category',axis=1)[:500].values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", + "metadata": {}, + "outputs": [], + "source": [ + "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" + ] + }, + { + "cell_type": "markdown", + "id": "712f4d35", + "metadata": {}, + "source": [ + "### 6. Evaluate performance \n", + "\n", + "Run the model on our test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e863ea7-5804-4637-b677-390c305cabfe", + "metadata": {}, + "outputs": [], + "source": [ + "s3_data = \"s3://{}/{}/test/test.csv\".format(s3_bucket, bucket_prefix)" + ] + }, + { + "cell_type": "markdown", + "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", + "metadata": {}, + "source": [ + "Add the evaluation function to our project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca4f7e49", + "metadata": {}, + "outputs": [], + "source": [ + "evaluate_function = project.get_function(\"evaluate\")" + ] + }, + { + "cell_type": "markdown", + "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", + "metadata": {}, + "source": [ + "Run the evaluation job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", + "metadata": {}, + "outputs": [], + "source": [ + "evaluate_run = evaluate_function.run(\n", + " handler=\"evaluate\",\n", + " params={\n", + " \"model_path\": xgb.model_data,\n", + " \"model_name\": \"xgboost-model\",\n", + " \"test_set\": s3_data,\n", + " \"label_column\": \"transaction_category\",\n", + " \"factorize_key\": factorize_key,\n", + " },\n", + " returns=[\"classification_report: dataset\"])" + ] + }, + { + "cell_type": "markdown", + "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", + "metadata": {}, + "source": [ + "See the evaluation result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", + "metadata": {}, + "outputs": [], + "source": [ + "evaluate_run.artifact(\"classification_report\").as_df()" + ] + }, + { + "cell_type": "markdown", + "id": "98d0b67e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You should see results similar to this:\n", + "\n", + "```\n", + " precision recall f1-score support\n", + "\n", + " Uncategorized 1.00 0.92 0.96 51\n", + " Entertainment 0.81 0.89 0.85 1486\n", + " Education 1.00 0.94 0.97 80\n", + " Shopping 0.86 0.94 0.90 3441\n", + " Personal Care 1.00 0.98 0.99 132\n", + " Health and Fitness 0.99 0.89 0.94 443\n", + " Food and Dining 0.99 0.82 0.90 918\n", + " Gifts and Donations 1.00 0.95 0.97 275\n", + " Investments 0.99 0.97 0.98 88\n", + " Bills and Utilities 1.00 0.99 1.00 332\n", + " Auto and Transport 0.94 0.84 0.88 1967\n", + " Travel 0.96 0.84 0.90 120\n", + " Fees and Charges 1.00 0.94 0.97 106\n", + " Business Services 1.00 0.99 1.00 146\n", + " Personal Services 1.00 0.96 0.98 75\n", + " Taxes 0.98 0.94 0.96 47\n", + " Gambling 1.00 1.00 1.00 15\n", + " Home 0.98 0.89 0.93 168\n", + "Pension and insurances 0.99 1.00 1.00 110\n", + "\n", + " accuracy 0.90 10000\n", + " macro avg 0.97 0.93 0.95 10000\n", + " weighted avg 0.91 0.90 0.90 10000\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "49fdc82d", + "metadata": {}, + "source": [ + "### 7. 
Clean up \n", + "\n", + "Remove the feature group and endpoint to clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f79b1164", + "metadata": {}, + "outputs": [], + "source": [ + "#feature_group.delete()\n", + "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e04b6fa6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This sa-east-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-southeast-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-south-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n" + ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "smdemo", + "language": "python", + "name": "smdemo" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From d42c6e9e6f37455e3a187155a0c80708d4c73614 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Wed, 7 Feb 2024 17:19:55 +0000 Subject: [PATCH 05/16] updating graph --- financial_payment_classification_v3.ipynb | 1378 +++++++++++++++++++-- 1 file changed, 1246 insertions(+), 132 deletions(-) diff --git a/financial_payment_classification_v3.ipynb b/financial_payment_classification_v3.ipynb index 555c0b4..707899f 100644 --- a/financial_payment_classification_v3.ipynb +++ b/financial_payment_classification_v3.ipynb @@ -108,7 +108,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-07 12:50:31,736 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" + "> 2024-02-07 16:34:18,167 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" ] } ], @@ -337,7 +337,7 @@ "metadata": {}, "outputs": [], "source": [ - "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" + "data[\"transaction_category_key\"] = data[\"transaction_category\"].replace(factorize_key)" ] }, { @@ -485,53 +485,59 @@ " sender_id\n", " amount\n", " timestamp\n", + " transaction_category_key\n", " transaction_id\n", " \n", " \n", " \n", " \n", " 106\n", - " 0\n", + " Uncategorized\n", " 4.601853e+15\n", " 4.274416e+15\n", " 
879.39\n", " 2021-01-01 15:07:52\n", + " 0\n", " 106.0\n", " \n", " \n", " 378\n", - " 0\n", + " Uncategorized\n", " 4.274544e+15\n", " 4.366884e+15\n", " 628.01\n", " 2021-01-01 16:33:53\n", + " 0\n", " 378.0\n", " \n", " \n", " 368\n", - " 0\n", + " Uncategorized\n", " 4.601853e+15\n", " 4.161674e+15\n", " 89.69\n", " 2021-01-01 18:17:29\n", + " 0\n", " 368.0\n", " \n", " \n", " 17\n", - " 0\n", + " Uncategorized\n", " 4.518552e+15\n", " 4.619387e+15\n", " 222.01\n", " 2021-01-01 18:33:18\n", + " 0\n", " 17.0\n", " \n", " \n", " 178\n", - " 0\n", + " Uncategorized\n", " 4.274544e+15\n", " 4.456440e+15\n", " 418.52\n", " 2021-01-01 19:33:31\n", + " 0\n", " 178.0\n", " \n", " \n", @@ -542,85 +548,91 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 69938\n", - " 9\n", + " Bills and Utilities\n", " 4.904096e+15\n", " 4.133603e+15\n", " 124.08\n", " 2024-02-05 15:00:00\n", + " 9\n", " 69938.0\n", " \n", " \n", " 70592\n", - " 9\n", + " Bills and Utilities\n", " 4.904096e+15\n", " 4.444087e+15\n", " 188.66\n", " 2024-02-06 10:00:00\n", + " 9\n", " 70592.0\n", " \n", " \n", " 70379\n", - " 9\n", + " Bills and Utilities\n", " 4.200241e+15\n", " 4.202495e+15\n", " 139.27\n", " 2024-02-06 15:00:00\n", + " 9\n", " 70379.0\n", " \n", " \n", " 70462\n", - " 9\n", + " Bills and Utilities\n", " 4.612985e+15\n", " 4.525455e+15\n", " 12.49\n", " 2024-02-07 10:00:00\n", + " 9\n", " 70462.0\n", " \n", " \n", " 71672\n", - " 9\n", + " Bills and Utilities\n", " 4.538817e+15\n", " 4.291294e+15\n", " 57.03\n", " 2024-02-07 15:00:00\n", + " 9\n", " 71672.0\n", " \n", " \n", "\n", - "

99997 rows × 6 columns

\n", + "

99997 rows × 7 columns

\n", "" ], "text/plain": [ " transaction_category receiver_id sender_id amount \\\n", - "106 0 4.601853e+15 4.274416e+15 879.39 \n", - "378 0 4.274544e+15 4.366884e+15 628.01 \n", - "368 0 4.601853e+15 4.161674e+15 89.69 \n", - "17 0 4.518552e+15 4.619387e+15 222.01 \n", - "178 0 4.274544e+15 4.456440e+15 418.52 \n", + "106 Uncategorized 4.601853e+15 4.274416e+15 879.39 \n", + "378 Uncategorized 4.274544e+15 4.366884e+15 628.01 \n", + "368 Uncategorized 4.601853e+15 4.161674e+15 89.69 \n", + "17 Uncategorized 4.518552e+15 4.619387e+15 222.01 \n", + "178 Uncategorized 4.274544e+15 4.456440e+15 418.52 \n", "... ... ... ... ... \n", - "69938 9 4.904096e+15 4.133603e+15 124.08 \n", - "70592 9 4.904096e+15 4.444087e+15 188.66 \n", - "70379 9 4.200241e+15 4.202495e+15 139.27 \n", - "70462 9 4.612985e+15 4.525455e+15 12.49 \n", - "71672 9 4.538817e+15 4.291294e+15 57.03 \n", + "69938 Bills and Utilities 4.904096e+15 4.133603e+15 124.08 \n", + "70592 Bills and Utilities 4.904096e+15 4.444087e+15 188.66 \n", + "70379 Bills and Utilities 4.200241e+15 4.202495e+15 139.27 \n", + "70462 Bills and Utilities 4.612985e+15 4.525455e+15 12.49 \n", + "71672 Bills and Utilities 4.538817e+15 4.291294e+15 57.03 \n", "\n", - " timestamp transaction_id \n", - "106 2021-01-01 15:07:52 106.0 \n", - "378 2021-01-01 16:33:53 378.0 \n", - "368 2021-01-01 18:17:29 368.0 \n", - "17 2021-01-01 18:33:18 17.0 \n", - "178 2021-01-01 19:33:31 178.0 \n", - "... ... ... \n", - "69938 2024-02-05 15:00:00 69938.0 \n", - "70592 2024-02-06 10:00:00 70592.0 \n", - "70379 2024-02-06 15:00:00 70379.0 \n", - "70462 2024-02-07 10:00:00 70462.0 \n", - "71672 2024-02-07 15:00:00 71672.0 \n", + " timestamp transaction_category_key transaction_id \n", + "106 2021-01-01 15:07:52 0 106.0 \n", + "378 2021-01-01 16:33:53 0 378.0 \n", + "368 2021-01-01 18:17:29 0 368.0 \n", + "17 2021-01-01 18:33:18 0 17.0 \n", + "178 2021-01-01 19:33:31 0 178.0 \n", + "... ... ... ... 
\n", + "69938 2024-02-05 15:00:00 9 69938.0 \n", + "70592 2024-02-06 10:00:00 9 70592.0 \n", + "70379 2024-02-06 15:00:00 9 70379.0 \n", + "70462 2024-02-07 10:00:00 9 70462.0 \n", + "71672 2024-02-07 15:00:00 9 71672.0 \n", "\n", - "[99997 rows x 6 columns]" + "[99997 rows x 7 columns]" ] }, "execution_count": 14, @@ -635,6 +647,200 @@ "data" ] }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3f47156f-64f1-40bd-801a-58136e2a25cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_categoryreceiver_idsender_idamounttimestamptransaction_category_keytransaction_id
106Uncategorized4.601853e+154.274416e+15879.392021-01-01 15:07:520106.0
378Uncategorized4.274544e+154.366884e+15628.012021-01-01 16:33:530378.0
368Uncategorized4.601853e+154.161674e+1589.692021-01-01 18:17:290368.0
17Uncategorized4.518552e+154.619387e+15222.012021-01-01 18:33:18017.0
178Uncategorized4.274544e+154.456440e+15418.522021-01-01 19:33:310178.0
........................
7192Entertainment4.957788e+154.353876e+1545.852021-03-20 19:45:0417192.0
2935Entertainment4.116469e+154.801157e+1557.442021-03-20 19:50:4912935.0
500Entertainment4.952788e+154.081148e+1512.372021-03-20 19:52:541500.0
1605Entertainment4.328906e+154.795968e+1566.622021-03-20 19:53:2411605.0
2480Entertainment4.903953e+154.042474e+1581.382021-03-20 19:56:4812480.0
\n", + "

9999 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category receiver_id sender_id amount \\\n", + "106 Uncategorized 4.601853e+15 4.274416e+15 879.39 \n", + "378 Uncategorized 4.274544e+15 4.366884e+15 628.01 \n", + "368 Uncategorized 4.601853e+15 4.161674e+15 89.69 \n", + "17 Uncategorized 4.518552e+15 4.619387e+15 222.01 \n", + "178 Uncategorized 4.274544e+15 4.456440e+15 418.52 \n", + "... ... ... ... ... \n", + "7192 Entertainment 4.957788e+15 4.353876e+15 45.85 \n", + "2935 Entertainment 4.116469e+15 4.801157e+15 57.44 \n", + "500 Entertainment 4.952788e+15 4.081148e+15 12.37 \n", + "1605 Entertainment 4.328906e+15 4.795968e+15 66.62 \n", + "2480 Entertainment 4.903953e+15 4.042474e+15 81.38 \n", + "\n", + " timestamp transaction_category_key transaction_id \n", + "106 2021-01-01 15:07:52 0 106.0 \n", + "378 2021-01-01 16:33:53 0 378.0 \n", + "368 2021-01-01 18:17:29 0 368.0 \n", + "17 2021-01-01 18:33:18 0 17.0 \n", + "178 2021-01-01 19:33:31 0 178.0 \n", + "... ... ... ... \n", + "7192 2021-03-20 19:45:04 1 7192.0 \n", + "2935 2021-03-20 19:50:49 1 2935.0 \n", + "500 2021-03-20 19:52:54 1 500.0 \n", + "1605 2021-03-20 19:53:24 1 1605.0 \n", + "2480 2021-03-20 19:56:48 1 2480.0 \n", + "\n", + "[9999 rows x 7 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "main_categories = list(factorize_key.keys())\n", + "part_categories = main_categories[:3] \n", + "part_data = data[data['transaction_category'].isin(part_categories)][:9999]\n", + "part_data" + ] + }, { "cell_type": "markdown", "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", @@ -657,19 +863,21 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", "metadata": {}, "outputs": [], "source": [ "def calculate_category_distance(event):\n", - " event['distance'] = abs(event['amount']-event['amount_avg_1d'])\n", + " category = event['transaction_category']\n", + " 
#event[category+'distance'] = abs(event['amount']-event[category+'_avg_1d'])\n", + " event['distance'] = abs(event['amount']/2)\n", " return event" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "id": "4101c303-2da3-431b-9375-9fa1747070af", "metadata": {}, "outputs": [ @@ -682,11 +890,11 @@ "\n", "\n", - "\n", + "\n", "\n", "mlrun-flow\n", - "\n", + "\n", "\n", "\n", "_start\n", @@ -705,76 +913,88 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", - "Aggregates\n", - "\n", - "Aggregates\n", + "OneHotEncoder\n", + "\n", + "OneHotEncoder\n", "\n", - "\n", + "\n", "\n", - "DateExtractor->Aggregates\n", - "\n", - "\n", + "DateExtractor->OneHotEncoder\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", + "Aggregates\n", + "\n", + "Aggregates\n", + "\n", + "\n", + "\n", + "OneHotEncoder->Aggregates\n", + "\n", + "\n", + "\n", + "\n", + "\n", "calculate_category_distance\n", - "\n", - "calculate_category_distance\n", + "\n", + "calculate_category_distance\n", "\n", "\n", - "\n", + "\n", "Aggregates->calculate_category_distance\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "DropFeatures\n", - "\n", - "DropFeatures\n", + "\n", + "DropFeatures\n", "\n", "\n", - "\n", + "\n", "calculate_category_distance->DropFeatures\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "parquet/parquet\n", - "\n", - "\n", - "parquet\n", + "\n", + "\n", + "parquet\n", "\n", "\n", - "\n", + "\n", "DropFeatures->parquet/parquet\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "nosql/nosql\n", - "\n", - "\n", - "nosql\n", + "\n", + "\n", + "nosql\n", "\n", "\n", - "\n", + "\n", "DropFeatures->nosql/nosql\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -783,25 +1003,39 @@ "import mlrun.feature_store as fstore\n", "from mlrun.feature_store.steps import OneHotEncoder, MapValues, DateExtractor, 
DropFeatures\n", "\n", + "# Define and add value mapping\n", + "main_categories = list(factorize_key.keys())\n", + "\n", + "main_categories = part_categories\n", + "\n", + "# One Hot Encode the newly defined mappings\n", + "one_hot_encoder_mapping = {'category': main_categories}\n", + "\n", "# creating feature set\n", - "extended_transactions_set = fstore.FeatureSet(\"transactions-v3\",\n", + "extended_transactions_set = fstore.FeatureSet(\"transactions-v13\",\n", " entities=[fstore.Entity(\"transaction_id\")],\n", " description=\"transactions feature set\")\n", "\n", "# setting up the graph\n", "# setting up the graph\n", "extended_transactions_set.graph \\\n", - " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp'))\n", - " \n", - "\n", + " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp')) \\\n", + " .to(OneHotEncoder(mapping=one_hot_encoder_mapping))\n", "\n", "extended_transactions_set.add_aggregation(name='amount',\n", " column='amount',\n", " operations=['avg'],\n", " windows=['1d'],\n", - " )\n", + " period='1h')\n", + "\n", + "# # Add the category aggregations over a 14 day window\n", + "# for category in main_categories:\n", + "# extended_transactions_set.add_aggregation(name=category,column=f'category_{category}',\n", + "# operations=['avg'], windows=['1d'])\n", "\n", - "extended_transactions_set.graph.to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates').to(DropFeatures(features=['timestamp']))\n", + "extended_transactions_set.graph \\\n", + " .to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates') \\\n", + " .to(DropFeatures(features=['timestamp']))\n", "\n", "\n", "extended_transactions_set.set_targets()\n", @@ -811,7 +1045,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": 
"1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", "metadata": {}, "outputs": [ @@ -819,42 +1053,300 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-07 13:11:28,253 [warning] Overriding type of entity 'transaction_id' from 'str' to 'float'. This may result in errors or unusable data.\n" + "> 2024-02-07 16:34:42,430 [warning] Overriding type of entity 'transaction_id' from 'str' to 'float'. This may result in errors or unusable data.\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amount_avg_1dreceiver_idsender_idamounttransaction_category_keytimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_id
106.0879.394.601853e+154.274416e+15879.39020211115752439.695
378.0628.014.274544e+154.366884e+15628.010202111163353314.005
368.089.694.601853e+154.161674e+1589.69020211118172944.845
17.0222.014.518552e+154.619387e+15222.010202111183318111.005
178.0418.524.274544e+154.456440e+15418.520202111193331209.260
.......................................
7192.045.854.957788e+154.353876e+1545.85120213201945422.925
2935.057.444.116469e+154.801157e+1557.441202132019504928.720
500.012.374.952788e+154.081148e+1512.37120213201952546.185
1605.066.624.328906e+154.795968e+1566.621202132019532433.310
2480.081.384.903953e+154.042474e+1581.381202132019564840.690
\n", + "

9999 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " amount_avg_1d receiver_id sender_id amount \\\n", + "transaction_id \n", + "106.0 879.39 4.601853e+15 4.274416e+15 879.39 \n", + "378.0 628.01 4.274544e+15 4.366884e+15 628.01 \n", + "368.0 89.69 4.601853e+15 4.161674e+15 89.69 \n", + "17.0 222.01 4.518552e+15 4.619387e+15 222.01 \n", + "178.0 418.52 4.274544e+15 4.456440e+15 418.52 \n", + "... ... ... ... ... \n", + "7192.0 45.85 4.957788e+15 4.353876e+15 45.85 \n", + "2935.0 57.44 4.116469e+15 4.801157e+15 57.44 \n", + "500.0 12.37 4.952788e+15 4.081148e+15 12.37 \n", + "1605.0 66.62 4.328906e+15 4.795968e+15 66.62 \n", + "2480.0 81.38 4.903953e+15 4.042474e+15 81.38 \n", + "\n", + " transaction_category_key timestamp_year timestamp_month \\\n", + "transaction_id \n", + "106.0 0 2021 1 \n", + "378.0 0 2021 1 \n", + "368.0 0 2021 1 \n", + "17.0 0 2021 1 \n", + "178.0 0 2021 1 \n", + "... ... ... ... \n", + "7192.0 1 2021 3 \n", + "2935.0 1 2021 3 \n", + "500.0 1 2021 3 \n", + "1605.0 1 2021 3 \n", + "2480.0 1 2021 3 \n", + "\n", + " timestamp_day timestamp_hour timestamp_minute \\\n", + "transaction_id \n", + "106.0 1 15 7 \n", + "378.0 1 16 33 \n", + "368.0 1 18 17 \n", + "17.0 1 18 33 \n", + "178.0 1 19 33 \n", + "... ... ... ... \n", + "7192.0 20 19 45 \n", + "2935.0 20 19 50 \n", + "500.0 20 19 52 \n", + "1605.0 20 19 53 \n", + "2480.0 20 19 56 \n", + "\n", + " timestamp_second distance \n", + "transaction_id \n", + "106.0 52 439.695 \n", + "378.0 53 314.005 \n", + "368.0 29 44.845 \n", + "17.0 18 111.005 \n", + "178.0 31 209.260 \n", + "... ... ... 
\n", + "7192.0 4 22.925 \n", + "2935.0 49 28.720 \n", + "500.0 54 6.185 \n", + "1605.0 24 33.310 \n", + "2480.0 48 40.690 \n", + "\n", + "[9999 rows x 12 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "ingested_data = extended_transactions_set.ingest(data, overwrite=True)\n", + "ingested_data = extended_transactions_set.ingest(part_data, overwrite=True)\n", "ingested_data" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "03ed9e67-34f1-44c6-9474-bb24e7561da6", - "metadata": {}, - "outputs": [], - "source": [ - "# Import MLRun's Feature Store\n", - "import mlrun.feature_store as fstore\n", - "\n", - "# create feature vector on top of aggreagations\n", - "# Define the list of features we will be using\n", - "features = ['transactions-v2.*']\n", - "\n", - "# Define the feature vector name for future reference\n", - "fv_name = 'aggreagations-vector'\n", - "\n", - "# Define the feature vector using our Feature Store (fstore)\n", - "aggregations_fv = fstore.FeatureVector(fv_name, \n", - " features, \n", - " description='stocks information')\n", - "\n", - "#label_feature = 'transactions-v2.transaction_category',\n", - "# Save the feature vector in the Feature Store\n", - "aggregations_fv.save()" - ] - }, { "cell_type": "markdown", "id": "b5e4834e", @@ -894,9 +1386,276 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "id": "bb4bdd8d", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_category_keyamount_avg_1dreceiver_idsender_idamounttimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
00879.394.601853e+154.274416e+15879.3920211115752439.695
10628.014.274544e+154.366884e+15628.01202111163353314.005
2089.694.601853e+154.161674e+1589.6920211118172944.845
30222.014.518552e+154.619387e+15222.01202111183318111.005
40418.524.274544e+154.456440e+15418.52202111193331209.260
.......................................
9994145.854.957788e+154.353876e+1545.8520213201945422.925
9995157.444.116469e+154.801157e+1557.44202132019504928.720
9996112.374.952788e+154.081148e+1512.3720213201952546.185
9997166.624.328906e+154.795968e+1566.62202132019532433.310
9998181.384.903953e+154.042474e+1581.38202132019564840.690
\n", + "

9999 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category_key amount_avg_1d receiver_id sender_id \\\n", + "0 0 879.39 4.601853e+15 4.274416e+15 \n", + "1 0 628.01 4.274544e+15 4.366884e+15 \n", + "2 0 89.69 4.601853e+15 4.161674e+15 \n", + "3 0 222.01 4.518552e+15 4.619387e+15 \n", + "4 0 418.52 4.274544e+15 4.456440e+15 \n", + "... ... ... ... ... \n", + "9994 1 45.85 4.957788e+15 4.353876e+15 \n", + "9995 1 57.44 4.116469e+15 4.801157e+15 \n", + "9996 1 12.37 4.952788e+15 4.081148e+15 \n", + "9997 1 66.62 4.328906e+15 4.795968e+15 \n", + "9998 1 81.38 4.903953e+15 4.042474e+15 \n", + "\n", + " amount timestamp_year timestamp_month timestamp_day timestamp_hour \\\n", + "0 879.39 2021 1 1 15 \n", + "1 628.01 2021 1 1 16 \n", + "2 89.69 2021 1 1 18 \n", + "3 222.01 2021 1 1 18 \n", + "4 418.52 2021 1 1 19 \n", + "... ... ... ... ... ... \n", + "9994 45.85 2021 3 20 19 \n", + "9995 57.44 2021 3 20 19 \n", + "9996 12.37 2021 3 20 19 \n", + "9997 66.62 2021 3 20 19 \n", + "9998 81.38 2021 3 20 19 \n", + "\n", + " timestamp_minute timestamp_second distance \n", + "0 7 52 439.695 \n", + "1 33 53 314.005 \n", + "2 17 29 44.845 \n", + "3 33 18 111.005 \n", + "4 33 31 209.260 \n", + "... ... ... ... 
\n", + "9994 45 4 22.925 \n", + "9995 50 49 28.720 \n", + "9996 52 54 6.185 \n", + "9997 53 24 33.310 \n", + "9998 56 48 40.690 \n", + "\n", + "[9999 rows x 12 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = ingested_data.reset_index(drop=True)\n", + "data = data[['transaction_category_key'] + [col for col in data.columns if col != 'transaction_category_key']]\n", + "\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "47512de3-60ac-49c7-ace8-031959527e86", + "metadata": {}, "outputs": [], "source": [ "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", @@ -915,7 +1674,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -935,10 +1694,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "e1ca2543", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:botocore.credentials:Found credentials in environment variables.\n", + "INFO:botocore.credentials:Found credentials in environment variables.\n", + "INFO:botocore.credentials:Found credentials in environment variables.\n" + ] + } + ], "source": [ "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", " os.path.join(bucket_prefix, \"train/train.csv\")\n", @@ -961,10 +1730,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "id": "a41b6a7d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n" + ] + } + ], "source": [ "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" ] @@ -979,7 +1756,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "id": 
"e51c917a", "metadata": {}, "outputs": [], @@ -1002,7 +1779,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -1027,7 +1804,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -1055,12 +1832,156 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "id": "c24e06fc", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-07-17-07-42-480\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-02-07 17:07:42 Starting - Starting the training job......\n", + "2024-02-07 17:08:18 Starting - Preparing the instances for training......\n", + "2024-02-07 17:09:17 Downloading - Downloading input data...\n", + "2024-02-07 17:09:47 Downloading - Downloading the training image...\n", + "2024-02-07 17:10:38 Training - Training image download completed. 
Training in progress...\n", + "2024-02-07 17:11:08 Uploading - Uploading generated training model\u001b[34m[2024-02-07 17:10:55.079 ip-10-0-147-106.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Train matrix has 6999 rows and 11 columns\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Validation matrix has 2000 rows\u001b[0m\n", + "\u001b[34m[2024-02-07 17:10:55.166 ip-10-0-147-106.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-07 17:10:55.166 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-07 17:10:55.167 ip-10-0-147-106.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-07 17:10:55.168 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-07 17:10:55.168 ip-10-0-147-106.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-07:17:10:55:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[2024-02-07 17:10:55.289 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-07 17:10:55.291 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[3]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + 
"\u001b[34m[12]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.00686#011validation-merror:0.00700\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[26]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + 
"\u001b[34m[35]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.00629#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.00629#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.00629#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[49]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", + 
"\u001b[34m[58]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + 
"\u001b[34m[81]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.00486#011validation-merror:0.00650\u001b[0m\n", + "\n", + "2024-02-07 17:11:19 Completed - Training job completed\n", + "Training seconds: 122\n", + "Billable seconds: 122\n" + ] + } + ], "source": [ "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" ] @@ -1077,17 +1998,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-07-17-07-42-480/output/model.tar.gz'" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "xgb.model_data" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": [], @@ -1097,10 +2029,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "preprocess\n", + "\n", + "preprocess\n", + "\n", + "\n", + "\n", + "_start->preprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "xgboost-model\n", + "\n", + "xgboost-model\n", + "\n", + "\n", + "\n", + "preprocess->xgboost-model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "postprocess\n", + "\n", + "postprocess\n", + "\n", + "\n", + "\n", + "xgboost-model->postprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Set the topology and get the graph object:\n", "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", @@ -1118,10 +2118,124 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-07 17:13:49,933 [info] Starting remote function deploy\n", + "2024-02-07 17:13:50 (info) Deploying function\n", + "2024-02-07 17:13:50 (info) Building\n", + "2024-02-07 17:13:50 
(info) Staging files and preparing base images\n", + "2024-02-07 17:13:50 (info) Building processor image\n", + "2024-02-07 17:15:36 (info) Build complete\n", + "Failed to deploy. Details:\n", + "Traceback (most recent call last):\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", + " self.load()\n", + " File \"/opt/nuclio/serving.py\", line 21, in load\n", + " model_file, extra_data = self.get_model(\".tar.gz\")\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", + " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", + " obj.download(temp_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", + " self._store.download(self._path, target_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", + " data = self.get(key)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", + " return obj.get()[\"Body\"].read()\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", + " response = action(self, *args, **kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", + " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", + " return self._make_api_call(operation_name, kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", + " raise error_class(parsed_response, operation_name)\n", + "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the 
GetObject operation: Access Denied\n", + " [worker_id=0]\n", + "Exception raised while running init_context [worker_id=0]\n", + "Traceback (most recent call last):\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", + " self.load()\n", + " File \"/opt/nuclio/serving.py\", line 21, in load\n", + " model_file, extra_data = self.get_model(\".tar.gz\")\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", + " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", + " obj.download(temp_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", + " self._store.download(self._path, target_path)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", + " data = self.get(key)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", + " return obj.get()[\"Body\"].read()\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", + " response = action(self, *args, **kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", + " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", + " return self._make_api_call(operation_name, kwargs)\n", + " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", + " raise error_class(parsed_response, operation_name)\n", + "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", + "\n", + "The above exception 
was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 480, in \n", + " run_wrapper()\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 468, in run_wrapper\n", + " loop.run_until_complete(wrapper_instance.initialize())\n", + " File \"/opt/conda/lib/python3.9/asyncio/base_events.py\", line 647, in run_until_complete\n", + " return future.result()\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 165, in initialize\n", + " await self._initialize_context()\n", + " File \"/opt/nuclio/_nuclio_wrapper.py\", line 188, in _initialize_context\n", + " init_context_result = getattr(self._entrypoint_module, 'init_context')(self._context)\n", + " File \"/opt/nuclio/serving.py\", line 135, in init_context\n", + " nuclio_init_hook(context, globals(), 'serving_v2')\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/runtimes/nuclio.py\", line 34, in nuclio_init_hook\n", + " v2_serving_init(context, data)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 349, in v2_serving_init\n", + " serving_handler = server.init_object(namespace or get_caller_globals())\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 192, in init_object\n", + " self.graph.init_object(self.context, namespace, self.load_mode, reset=True)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 917, in init_object\n", + " step.init_object(context, namespace, mode, reset=reset)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 444, in init_object\n", + " self._post_init(mode)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 502, in _post_init\n", + " self._object.post_init(mode)\n", + " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 143, in post_init\n", + " self._load_and_update_state()\n", + " 
File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 131, in _load_and_update_state\n", + " raise RuntimeError(f\"failed to load model {self.name}\") from exc\n", + "RuntimeError: failed to load model xgboost-model\n", + "> 2024-02-07 17:15:51,622 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" + ] + }, + { + "ename": "RunError", + "evalue": "Function serving deployment failed", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[60], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/project.py:3188\u001b[0m, in \u001b[0;36mMlrunProject.deploy_function\u001b[0;34m(self, function, dashboard, models, env, tag, verbose, builder_env, mock)\u001b[0m\n\u001b[1;32m 3166\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeploy_function\u001b[39m(\n\u001b[1;32m 3167\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 3168\u001b[0m function: typing\u001b[38;5;241m.\u001b[39mUnion[\u001b[38;5;28mstr\u001b[39m, mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mBaseRuntime],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3175\u001b[0m mock: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 3176\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mUnion[DeployStatus, kfp\u001b[38;5;241m.\u001b[39mdsl\u001b[38;5;241m.\u001b[39mContainerOp]:\n\u001b[1;32m 3177\u001b[0m \u001b[38;5;250m 
\u001b[39m\u001b[38;5;124;03m\"\"\"deploy real-time (nuclio based) functions\u001b[39;00m\n\u001b[1;32m 3178\u001b[0m \n\u001b[1;32m 3179\u001b[0m \u001b[38;5;124;03m :param function: name of the function (in the project) or function object\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3186\u001b[0m \u001b[38;5;124;03m :param mock: deploy mock server vs a real Nuclio function (for local simulations)\u001b[39;00m\n\u001b[1;32m 3187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3189\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3190\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3192\u001b[0m \u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3194\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3196\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmock\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmock\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
3198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/operations.py:395\u001b[0m, in \u001b[0;36mdeploy_function\u001b[0;34m(function, dashboard, models, env, tag, verbose, builder_env, project_object, mock)\u001b[0m\n\u001b[1;32m 388\u001b[0m function\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 390\u001b[0m state\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 391\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname},\n\u001b[1;32m 392\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 393\u001b[0m )\n\u001b[0;32m--> 395\u001b[0m address \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# return object with the same outputs as the KFP op (allow using the 
same pipeline)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 400\u001b[0m state\u001b[38;5;241m=\u001b[39mfunction\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 401\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: address, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mnuclio_name},\n\u001b[1;32m 402\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 403\u001b[0m )\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:647\u001b[0m, in \u001b[0;36mServingRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_deploy_function_refs()\n\u001b[1;32m 645\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeploy root function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m ...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 652\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 654\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 655\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:586\u001b[0m, in \u001b[0;36mRemoteRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_credentials_from_remote_build(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 584\u001b[0m \u001b[38;5;66;03m# when a function is deployed, we wait for it to be ready by default\u001b[39;00m\n\u001b[1;32m 585\u001b[0m \u001b[38;5;66;03m# this also means that the function object will be updated with the function status\u001b[39;00m\n\u001b[0;32m--> 586\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_function_deployment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[38;5;66;03m# NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort\u001b[39;00m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# now, functions can be not exposed (using service type ClusterIP) and hence\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;66;03m# for BC we first try to populate the external 
invocation url, and then\u001b[39;00m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;66;03m# if not exists, take the internal invocation url\u001b[39;00m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mexternal_invocation_urls:\n", + "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:633\u001b[0m, in \u001b[0;36mRemoteRuntime._wait_for_function_deployment\u001b[0;34m(self, db, verbose)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m state \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 632\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNuclio function failed to deploy\u001b[39m\u001b[38;5;124m\"\u001b[39m, function_state\u001b[38;5;241m=\u001b[39mstate)\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RunError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFunction \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m deployment failed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mRunError\u001b[0m: Function serving deployment failed" + ] + } + ], "source": [ "project.deploy_function(\"serving\")" ] From 791cc1fc5c322bbf467b3e50d34852b267ec451c Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Thu, 8 Feb 2024 11:41:49 +0000 Subject: [PATCH 06/16] update serving --- serving-Copy1.ipynb | 445 ++++++++++++++++++++++++++++++--------- src/functions/serving.py | 3 +- utils.py | 6 +- 3 files changed, 353 insertions(+), 101 deletions(-) diff --git a/serving-Copy1.ipynb b/serving-Copy1.ipynb index 975ad64..8fcf57d 100644 --- a/serving-Copy1.ipynb +++ b/serving-Copy1.ipynb @@ -33,13 +33,13 @@ "name": "stdout", 
"output_type": "stream", "text": [ - "> 2024-02-06 10:06:18,523 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" + "> 2024-02-08 10:46:56,324 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" ] } ], "source": [ "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v2\", \n", + " name=\"sagemaker-v3\", \n", " user_project=True,\n", " parameters={\n", " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz'" + "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-07-17-07-42-480/output/model.tar.gz'" ] }, { @@ -75,6 +75,162 @@ { "cell_type": "code", "execution_count": 6, + "id": "3da03265-204e-4600-8746-adc81f7ce3bf", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "data = pd.read_csv(\n", + " \"test.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b004f4de-cab6-47b6-b786-07b0601eac82", + "metadata": {}, + "outputs": [], + "source": [ + "data_cols = list(data.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5122852e-74b7-409b-b7c8-0941d22ba2d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['transaction_category_key',\n", + " 'amount_avg_1d',\n", + " 'receiver_id',\n", + " 'sender_id',\n", + " 'amount',\n", + " 'timestamp_year',\n", + " 'timestamp_month',\n", + " 'timestamp_day',\n", + " 'timestamp_hour',\n", + " 'timestamp_minute',\n", + " 'timestamp_second',\n", + " 'distance']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b81e491-e4d8-4253-8f6d-65b5a8a74640", + "metadata": {}, + 
"outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "41719e95-2d93-437f-8ace-57963e281d9d", + "metadata": {}, + "outputs": [], + "source": [ + "# Import MLRun's Feature Store\n", + "import mlrun.feature_store as fstore\n", + "\n", + "# create feature vector on top of aggreagations\n", + "# Define the list of features we will be using\n", + "features = [f\"transactions-v13.{name}\" for name in data_cols] \n", + "\n", + "# Define the feature vector name for future reference\n", + "fv_name = 'transactions-vector-v3'\n", + "\n", + "# Define the feature vector using our Feature Store (fstore)\n", + "transactions_fv = fstore.FeatureVector(fv_name, \n", + " features, \n", + " description='stocks information')\n", + "\n", + "#label_feature = 'transactions-v2.transaction_category',\n", + "# Save the feature vector in the Feature Store\n", + "transactions_fv.save()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "81b398c4-4aa0-4746-8a4c-e8c745027692", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun.feature_store as fs\n", + "# resp = fs.FeatureVector.get_offline_features(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector-v3:latest\")\n", + "# #Preview the dataset\n", + "# resp.to_dataframe().tail(5)\n", + "\n", + "svc = fs.FeatureVector.get_online_feature_service(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector-v3:latest\")\n", + "resp = svc.get([{\"transaction_id\": \"42\"}])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "04f288c0-11bc-45a1-ba23-c435e1696aa4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[None]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "resp" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "82817363-e449-4c0b-8527-1642b74c9406", + "metadata": {}, + "outputs": [], + "source": [ + "# # Set the topology and get 
the graph object:\n", + "# graph = test_serving_function.set_topology(\"flow\", engine=\"async\")\n", + "\n", + "# # Add the steps:\n", + "# graph.to(\"XGBModelServer\",\n", + "# name=\"xgboost-model\",\n", + "# model_path=model_path) \\\n", + "# .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", + "\n", + "# # Plot to graph:\n", + "# test_serving_function.plot(rankdir='LR')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e07c6b0b-7c9a-4c44-bb08-ea8a865a043c", + "metadata": {}, + "outputs": [], + "source": [ + "#print(test_serving_function.spec.to_yaml())" + ] + }, + { + "cell_type": "code", + "execution_count": 29, "id": "6a291b9c-0acc-4807-ab8e-4bec180a2bbf", "metadata": {}, "outputs": [ @@ -87,83 +243,74 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "mlrun-flow\n", - "\n", + "\n", "\n", "\n", "_start\n", - "\n", - "start\n", + "\n", + "start\n", "\n", - "\n", "\n", - "preprocess\n", - "\n", - "preprocess\n", + "\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "_start->preprocess\n", - "\n", - "\n", + "_start->\n", + "\n", + "\n", "\n", "\n", "\n", "xgboost-model\n", - "\n", - "xgboost-model\n", + "\n", + "xgboost-model\n", "\n", - "\n", + "\n", "\n", - "preprocess->xgboost-model\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "postprocess\n", - "\n", - "postprocess\n", - "\n", - "\n", - "\n", - "xgboost-model->postprocess\n", - "\n", - "\n", + "->xgboost-model\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 6, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Set the topology and get the graph object:\n", - "graph = test_serving_function.set_topology(\"flow\", engine=\"async\")\n", "\n", - "# Add the steps:\n", - "graph.to(handler=\"preprocess\", name=\"preprocess\") \\\n", - " .to(\"XGBModelServer\",\n", - " name=\"xgboost-model\",\n", - " model_path=model_path) \\\n", - " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", 
+ "# test_serving_function.set_topology(\n", + "# \"router\",\n", + "# mlrun.serving.routers.EnrichmentModelRouter(\n", + "# feature_vector_uri=\"store://feature-vectors/sagemaker-v3-admin/transactions-vector-v3:latest\",\n", + "# impute_policy={\"*\": \"$mean\"}),\n", + "# )\n", + "# # add the 3 trained models to the Ensemble\n", + "# for model in project.list_models('', tag='latest'):\n", + "# name = model.spec.db_key\n", + "# serving_fn.add_model(name, class_name=\"ClassifierModel\", model_path=model.uri)\n", + "\n", + "test_serving_function.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=model_path)\n", "\n", - "# Plot to graph:\n", - "test_serving_function.plot(rankdir='LR')" + "# Plot the ensemble configuration\n", + "test_serving_function.spec.graph.plot()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 30, "id": "fb976023-5a2c-4dc8-b1b7-fd897446b747", "metadata": {}, "outputs": [ @@ -171,23 +318,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-06 10:06:19,590 [info] model xgboost-model was loaded\n", - "> 2024-02-06 10:06:19,926 [error] Pushing error to error stream: Expected key \"inputs\" in request body\n", - "Traceback (most recent call last):\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/flow.py\", line 212, in _do_and_recover\n", - " return await self._do(event)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/flow.py\", line 423, in _do\n", - " fn_result = await self._call(element)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/flow.py\", line 410, in _call\n", - " res = self._fn(element)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 242, in do_event\n", - " request = self._pre_event_processing_actions(event, event_body, op)\n", - " File 
\"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 225, in _pre_event_processing_actions\n", - " return self.validate(request, op)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 358, in validate\n", - " raise Exception('Expected key \"inputs\" in request body')\n", - "Exception: Expected key \"inputs\" in request body\n", - "\n" + "> 2024-02-08 11:07:24,154 [info] model xgboost-model was loaded\n", + "> 2024-02-08 11:07:24,155 [info] Loaded ['xgboost-model']\n" ] } ], @@ -197,59 +329,160 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 39, "id": "35e98782-129d-4ffb-b27e-d580589d6106", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "data = pd.read_csv(\n", - " \"financial_transactions_mini.csv\",\n", - " parse_dates=[\"timestamp\"],\n", - " infer_datetime_format=True,\n", - " dtype={\"transaction_category\": \"string\"},\n", - ")" + " \"test.csv\")" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "cf821aae-83e4-4cc7-ba4a-b3038f7fd954", + "execution_count": 40, + "id": "dfd10537-482a-404b-b765-b0db5cbb497b", "metadata": {}, "outputs": [], "source": [ - "data['transaction_id'] = data.reset_index().index" + "data = data.drop('transaction_category_key', axis=1)" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "d9b9df7b-3fce-4e2b-b739-2a845ae1df30", + "execution_count": 41, + "id": "8f67e627-cfbf-4b8b-a5b3-7b9e836f779a", + "metadata": {}, + "outputs": [], + "source": [ + "data = data[:1]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "90abdb9d-3140-45eb-9a4d-a83a74b95700", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amount_avg_1dreceiver_idsender_idamounttimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
053.074.726537e+154.070479e+1553.07202121919374026.535
\n", + "
" + ], "text/plain": [ - "[{'receiver_id': 4518551904499919,\n", - " 'sender_id': 4333582346477646,\n", - " 'amount': 833.26,\n", - " 'timestamp': Timestamp('2021-03-10 19:57:42'),\n", - " 'transaction_id': 0}]" + " amount_avg_1d receiver_id sender_id amount timestamp_year \\\n", + "0 53.07 4.726537e+15 4.070479e+15 53.07 2021 \n", + "\n", + " timestamp_month timestamp_day timestamp_hour timestamp_minute \\\n", + "0 2 19 19 37 \n", + "\n", + " timestamp_second distance \n", + "0 40 26.535 " ] }, - "execution_count": 10, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "first_event_data = data.drop('transaction_category',axis=1)[:1].to_dict('records')\n", - "first_event_data" + "data" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 45, + "id": "09081ad4-e6d9-4ed8-9d47-ee0175bd291e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[53.07,\n", + " 4726536548206059.0,\n", + " 4070478627221885.0,\n", + " 53.07,\n", + " 2021.0,\n", + " 2.0,\n", + " 19.0,\n", + " 19.0,\n", + " 37.0,\n", + " 40.0,\n", + " 26.535]]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_ls = data.values.tolist()\n", + "data_ls" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "d9b9df7b-3fce-4e2b-b739-2a845ae1df30", + "metadata": {}, + "outputs": [], + "source": [ + "inputs_data = {'inputs': data_ls }" + ] + }, + { + "cell_type": "code", + "execution_count": 48, "id": "dff40293-9d50-400c-9a1b-62a7e610e176", "metadata": {}, "outputs": [ @@ -257,40 +490,58 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-06 10:06:19,928 [error] run error, Traceback (most recent call last):\n", + "> 2024-02-08 11:14:12,578 [error] run error, Traceback (most recent call last):\n", " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py\", line 280, in run\n", " response = 
self.graph.run(event, **(extra_args or {}))\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 1147, in run\n", - " return resp.await_result()\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/storey/sources.py\", line 67, in await_result\n", - " raise copy.copy(result)\n", - "Exception: Expected key \"inputs\" in request body\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 548, in run\n", + " raise exc\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 531, in run\n", + " return self._handler(event, *args, **kwargs)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 148, in do_event\n", + " event = self.preprocess(event)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 1174, in preprocess\n", + " event.body[\"inputs\"] = self._feature_service.get(\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/feature_store/feature_vector.py\", line 892, in get\n", + " raise mlrun.errors.MLRunInvalidArgumentError(\n", + "mlrun.errors.MLRunInvalidArgumentError: input list must be in the same size of the index_keys list\n", "\n" ] }, { "ename": "RuntimeError", - "evalue": "failed (400): Exception: Expected key \"inputs\" in request body", + "evalue": "failed (400): MLRunInvalidArgumentError: input list must be in the same size of the index_keys list", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m 
\u001b[43mserver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfirst_event_data\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[48], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mserver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_data\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py:250\u001b[0m, in \u001b[0;36mGraphServer.test\u001b[0;34m(self, path, body, method, headers, content_type, silent, get_body, event_id, trigger, offset, time)\u001b[0m\n\u001b[1;32m 248\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun(event, get_body\u001b[38;5;241m=\u001b[39mget_body)\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(resp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus_code\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m resp\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m300\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m silent:\n\u001b[0;32m--> 250\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfailed (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m): \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 251\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", - 
"\u001b[0;31mRuntimeError\u001b[0m: failed (400): Exception: Expected key \"inputs\" in request body" + "\u001b[0;31mRuntimeError\u001b[0m: failed (400): MLRunInvalidArgumentError: input list must be in the same size of the index_keys list" ] } ], "source": [ - "response = server.test(body=first_event_data)" + "response = server.test(body=inputs_data)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "ef10a992-7fce-424f-8733-f1eb190f7c42", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'response' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[38], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mresponse\u001b[49m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'response' is not defined" + ] + } + ], "source": [ "print(response)" ] diff --git a/src/functions/serving.py b/src/functions/serving.py index a8735c2..6b6a3f7 100644 --- a/src/functions/serving.py +++ b/src/functions/serving.py @@ -127,5 +127,6 @@ def preprocess(event): values_list = list(restructured_event[0].values()) return_list = [values_list] return_list - return return_list + return_dict = {"inputs": return_list} + return return_dict \ No newline at end of file diff --git a/utils.py b/utils.py index 7455507..b5eb3a5 100644 --- a/utils.py +++ b/utils.py @@ -23,9 +23,9 @@ def update_timestamps(data): times.append(datetime(year, month, day, hour)) # Iterate over each transaction category - for i in range(len(data["transaction_category"].unique())): + for i in range(len(data["transaction_category_key"].unique())): # Extract all the rows for each category - category_data = data[data['transaction_category'] == str(i)] + category_data = data[data['transaction_category_key'] == str(i)] # 
Ensure timestamp is a datetime object pd.to_datetime(category_data.timestamp) @@ -40,7 +40,7 @@ def update_timestamps(data): # Update the initial dataframe to include those updated rows data.update(latest_rows) - data.sort_values(["transaction_category", "timestamp"], inplace=True) + data.sort_values(["transaction_category_key", "timestamp"], inplace=True) return data \ No newline at end of file From 74652dd53e0787231b7a6392427ee422e0749849 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Thu, 8 Feb 2024 17:46:13 +0000 Subject: [PATCH 07/16] updating feature store graph --- financial_payment_classification_v3.ipynb | 2199 ++++++++++----------- serving-Copy1.ipynb | 255 +-- src/functions/serving.py | 110 +- utils.py | 81 +- 4 files changed, 1278 insertions(+), 1367 deletions(-) diff --git a/financial_payment_classification_v3.ipynb b/financial_payment_classification_v3.ipynb index 707899f..60e8dea 100644 --- a/financial_payment_classification_v3.ipynb +++ b/financial_payment_classification_v3.ipynb @@ -108,7 +108,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-07 16:34:18,167 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" + "> 2024-02-08 17:19:18,429 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" ] } ], @@ -311,154 +311,12 @@ "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" ] }, - { - "cell_type": "markdown", - "id": "b5492919", - "metadata": {}, - "source": [ - "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" - ] - }, { "cell_type": "code", "execution_count": 11, - "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", + "id": "8c15f00d-8f89-41ec-aa22-f23fc394d1b4", "metadata": {}, - "outputs": [], - "source": [ - "for key, val in factorize_key.items():\n", - " factorize_key[key] = str(val)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "ea2ebdd5", - "metadata": {}, - "outputs": 
[], - "source": [ - "data[\"transaction_category_key\"] = data[\"transaction_category\"].replace(factorize_key)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", - "metadata": {}, - "outputs": [], - "source": [ - "data['transaction_id']= data.reset_index().index " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", - "metadata": { - "scrolled": true - }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using 
.loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - 
"\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" - ] - }, { "data": { "text/html": [ @@ -485,60 +343,48 @@ " sender_id\n", " amount\n", " timestamp\n", - " transaction_category_key\n", - " transaction_id\n", " \n", " \n", " \n", " \n", - " 106\n", + " 0\n", " Uncategorized\n", - " 4.601853e+15\n", - " 4.274416e+15\n", - " 879.39\n", - " 2021-01-01 15:07:52\n", - " 0\n", - " 106.0\n", + " 4518551904499919\n", + " 4333582346477646\n", + " 833.26\n", + " 2024-01-05 12:35:02.641158\n", " \n", " \n", - " 378\n", + " 1\n", " Uncategorized\n", - " 4.274544e+15\n", - " 4.366884e+15\n", - " 628.01\n", - " 2021-01-01 16:33:53\n", - " 0\n", - " 378.0\n", + " 4518551904499919\n", + " 4642413144038776\n", + " 596.63\n", + " 2023-12-09 10:30:52.641158\n", " \n", " \n", - " 368\n", + " 2\n", " Uncategorized\n", - " 4.601853e+15\n", - " 4.161674e+15\n", - " 89.69\n", - " 2021-01-01 18:17:29\n", - " 0\n", - " 368.0\n", + " 4274544022939522\n", + " 4952665515556751\n", + " 176.76\n", + " 2023-12-19 11:06:52.641158\n", " \n", " \n", - " 17\n", + " 3\n", " Uncategorized\n", - " 4.518552e+15\n", - " 4.619387e+15\n", - " 222.01\n", - " 2021-01-01 18:33:18\n", - " 0\n", - " 17.0\n", + " 4518551904499919\n", + " 4457298962882528\n", + " 879.78\n", + " 2024-02-04 08:51:39.641158\n", " \n", " \n", - " 178\n", + " 4\n", " Uncategorized\n", - " 4.274544e+15\n", - " 4.456440e+15\n", - " 418.52\n", - " 2021-01-01 19:33:31\n", - " 0\n", - " 178.0\n", + " 4601853246125220\n", + " 4578126462896710\n", + " 742.25\n", + " 2024-01-30 08:27:36.641158\n", " \n", " \n", " ...\n", @@ -547,298 +393,158 @@ " ...\n", " ...\n", " ...\n", - " ...\n", - " ...\n", " \n", " \n", - " 69938\n", - " Bills and Utilities\n", - " 4.904096e+15\n", - " 4.133603e+15\n", - " 
124.08\n", - " 2024-02-05 15:00:00\n", - " 9\n", - " 69938.0\n", + " 99992\n", + " Pension and insurances\n", + " 4405008355220324\n", + " 4583355906735225\n", + " 205.43\n", + " 2024-02-15 05:01:13.641158\n", " \n", " \n", - " 70592\n", - " Bills and Utilities\n", - " 4.904096e+15\n", - " 4.444087e+15\n", - " 188.66\n", - " 2024-02-06 10:00:00\n", - " 9\n", - " 70592.0\n", + " 99993\n", + " Pension and insurances\n", + " 4300416744511335\n", + " 4949240916846171\n", + " 151.49\n", + " 2024-01-19 12:07:38.641158\n", " \n", " \n", - " 70379\n", - " Bills and Utilities\n", - " 4.200241e+15\n", - " 4.202495e+15\n", - " 139.27\n", - " 2024-02-06 15:00:00\n", - " 9\n", - " 70379.0\n", + " 99994\n", + " Pension and insurances\n", + " 4405008355220324\n", + " 4996896020767264\n", + " 188.28\n", + " 2024-01-03 12:28:30.641158\n", " \n", " \n", - " 70462\n", - " Bills and Utilities\n", - " 4.612985e+15\n", - " 4.525455e+15\n", - " 12.49\n", - " 2024-02-07 10:00:00\n", - " 9\n", - " 70462.0\n", + " 99995\n", + " Pension and insurances\n", + " 4262047194499006\n", + " 4017367486513464\n", + " 204.26\n", + " 2023-12-12 16:02:27.641158\n", " \n", " \n", - " 71672\n", - " Bills and Utilities\n", - " 4.538817e+15\n", - " 4.291294e+15\n", - " 57.03\n", - " 2024-02-07 15:00:00\n", - " 9\n", - " 71672.0\n", + " 99996\n", + " Pension and insurances\n", + " 4627516674144704\n", + " 4250420705087194\n", + " 207.92\n", + " 2024-02-08 17:19:20.641158\n", " \n", " \n", "\n", - "

99997 rows × 7 columns

\n", + "

99997 rows × 5 columns

\n", "" ], "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "106 Uncategorized 4.601853e+15 4.274416e+15 879.39 \n", - "378 Uncategorized 4.274544e+15 4.366884e+15 628.01 \n", - "368 Uncategorized 4.601853e+15 4.161674e+15 89.69 \n", - "17 Uncategorized 4.518552e+15 4.619387e+15 222.01 \n", - "178 Uncategorized 4.274544e+15 4.456440e+15 418.52 \n", - "... ... ... ... ... \n", - "69938 Bills and Utilities 4.904096e+15 4.133603e+15 124.08 \n", - "70592 Bills and Utilities 4.904096e+15 4.444087e+15 188.66 \n", - "70379 Bills and Utilities 4.200241e+15 4.202495e+15 139.27 \n", - "70462 Bills and Utilities 4.612985e+15 4.525455e+15 12.49 \n", - "71672 Bills and Utilities 4.538817e+15 4.291294e+15 57.03 \n", + " transaction_category receiver_id sender_id amount \\\n", + "0 Uncategorized 4518551904499919 4333582346477646 833.26 \n", + "1 Uncategorized 4518551904499919 4642413144038776 596.63 \n", + "2 Uncategorized 4274544022939522 4952665515556751 176.76 \n", + "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", + "4 Uncategorized 4601853246125220 4578126462896710 742.25 \n", + "... ... ... ... ... \n", + "99992 Pension and insurances 4405008355220324 4583355906735225 205.43 \n", + "99993 Pension and insurances 4300416744511335 4949240916846171 151.49 \n", + "99994 Pension and insurances 4405008355220324 4996896020767264 188.28 \n", + "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", + "99996 Pension and insurances 4627516674144704 4250420705087194 207.92 \n", "\n", - " timestamp transaction_category_key transaction_id \n", - "106 2021-01-01 15:07:52 0 106.0 \n", - "378 2021-01-01 16:33:53 0 378.0 \n", - "368 2021-01-01 18:17:29 0 368.0 \n", - "17 2021-01-01 18:33:18 0 17.0 \n", - "178 2021-01-01 19:33:31 0 178.0 \n", - "... ... ... ... 
\n", - "69938 2024-02-05 15:00:00 9 69938.0 \n", - "70592 2024-02-06 10:00:00 9 70592.0 \n", - "70379 2024-02-06 15:00:00 9 70379.0 \n", - "70462 2024-02-07 10:00:00 9 70462.0 \n", - "71672 2024-02-07 15:00:00 9 71672.0 \n", + " timestamp \n", + "0 2024-01-05 12:35:02.641158 \n", + "1 2023-12-09 10:30:52.641158 \n", + "2 2023-12-19 11:06:52.641158 \n", + "3 2024-02-04 08:51:39.641158 \n", + "4 2024-01-30 08:27:36.641158 \n", + "... ... \n", + "99992 2024-02-15 05:01:13.641158 \n", + "99993 2024-01-19 12:07:38.641158 \n", + "99994 2024-01-03 12:28:30.641158 \n", + "99995 2023-12-12 16:02:27.641158 \n", + "99996 2024-02-08 17:19:20.641158 \n", "\n", - "[99997 rows x 7 columns]" + "[99997 rows x 5 columns]" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day)\n", "from utils import update_timestamps\n", - "data = update_timestamps(data)\n", + "data=update_timestamps(data)\n", "data" ] }, + { + "cell_type": "markdown", + "id": "b5492919", + "metadata": {}, + "source": [ + "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", + "metadata": {}, + "outputs": [], + "source": [ + "# for key, val in factorize_key.items():\n", + "# factorize_key[key] = str(val)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ea2ebdd5", + "metadata": {}, + "outputs": [], + "source": [ + "# data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", + "metadata": {}, + "outputs": [], + "source": [ + "data['transaction_id']= data.reset_index().index " + ] + }, + { + "cell_type": "code", + 
"execution_count": 16, + "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# # Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day)\n", + "# from utils import update_timestamps\n", + "# data = update_timestamps(data)\n", + "# data" + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "id": "3f47156f-64f1-40bd-801a-58136e2a25cb", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamounttimestamptransaction_category_keytransaction_id
106Uncategorized4.601853e+154.274416e+15879.392021-01-01 15:07:520106.0
378Uncategorized4.274544e+154.366884e+15628.012021-01-01 16:33:530378.0
368Uncategorized4.601853e+154.161674e+1589.692021-01-01 18:17:290368.0
17Uncategorized4.518552e+154.619387e+15222.012021-01-01 18:33:18017.0
178Uncategorized4.274544e+154.456440e+15418.522021-01-01 19:33:310178.0
........................
7192Entertainment4.957788e+154.353876e+1545.852021-03-20 19:45:0417192.0
2935Entertainment4.116469e+154.801157e+1557.442021-03-20 19:50:4912935.0
500Entertainment4.952788e+154.081148e+1512.372021-03-20 19:52:541500.0
1605Entertainment4.328906e+154.795968e+1566.622021-03-20 19:53:2411605.0
2480Entertainment4.903953e+154.042474e+1581.382021-03-20 19:56:4812480.0
\n", - "

9999 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "106 Uncategorized 4.601853e+15 4.274416e+15 879.39 \n", - "378 Uncategorized 4.274544e+15 4.366884e+15 628.01 \n", - "368 Uncategorized 4.601853e+15 4.161674e+15 89.69 \n", - "17 Uncategorized 4.518552e+15 4.619387e+15 222.01 \n", - "178 Uncategorized 4.274544e+15 4.456440e+15 418.52 \n", - "... ... ... ... ... \n", - "7192 Entertainment 4.957788e+15 4.353876e+15 45.85 \n", - "2935 Entertainment 4.116469e+15 4.801157e+15 57.44 \n", - "500 Entertainment 4.952788e+15 4.081148e+15 12.37 \n", - "1605 Entertainment 4.328906e+15 4.795968e+15 66.62 \n", - "2480 Entertainment 4.903953e+15 4.042474e+15 81.38 \n", - "\n", - " timestamp transaction_category_key transaction_id \n", - "106 2021-01-01 15:07:52 0 106.0 \n", - "378 2021-01-01 16:33:53 0 378.0 \n", - "368 2021-01-01 18:17:29 0 368.0 \n", - "17 2021-01-01 18:33:18 0 17.0 \n", - "178 2021-01-01 19:33:31 0 178.0 \n", - "... ... ... ... \n", - "7192 2021-03-20 19:45:04 1 7192.0 \n", - "2935 2021-03-20 19:50:49 1 2935.0 \n", - "500 2021-03-20 19:52:54 1 500.0 \n", - "1605 2021-03-20 19:53:24 1 1605.0 \n", - "2480 2021-03-20 19:56:48 1 2480.0 \n", - "\n", - "[9999 rows x 7 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "main_categories = list(factorize_key.keys())\n", - "part_categories = main_categories[:3] \n", - "part_data = data[data['transaction_category'].isin(part_categories)][:9999]\n", - "part_data" + "# main_categories = list(factorize_key.keys())\n", + "# part_categories = main_categories[:3] \n", + "# part_data = data[data['transaction_category'].isin(part_categories)][:9999]\n", + "# part_data" ] }, { @@ -863,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", "metadata": {}, "outputs": [], @@ -877,7 +583,7 @@ }, { "cell_type": "code", - 
"execution_count": 17, + "execution_count": 19, "id": "4101c303-2da3-431b-9375-9fa1747070af", "metadata": {}, "outputs": [ @@ -991,10 +697,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1006,27 +712,20 @@ "# Define and add value mapping\n", "main_categories = list(factorize_key.keys())\n", "\n", - "main_categories = part_categories\n", + "#main_categories = part_categories\n", "\n", "# One Hot Encode the newly defined mappings\n", "one_hot_encoder_mapping = {'category': main_categories}\n", "\n", "# creating feature set\n", - "extended_transactions_set = fstore.FeatureSet(\"transactions-v13\",\n", + "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", " entities=[fstore.Entity(\"transaction_id\")],\n", " description=\"transactions feature set\")\n", - "\n", - "# setting up the graph\n", "# setting up the graph\n", "extended_transactions_set.graph \\\n", " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp')) \\\n", " .to(OneHotEncoder(mapping=one_hot_encoder_mapping))\n", - "\n", - "extended_transactions_set.add_aggregation(name='amount',\n", - " column='amount',\n", - " operations=['avg'],\n", - " windows=['1d'],\n", - " period='1h')\n", + "extended_transactions_set.add_aggregation(name='amount',column='amount',operations=['avg'],windows=['1d'],period='1h')\n", "\n", "# # Add the category aggregations over a 14 day window\n", "# for category in main_categories:\n", @@ -1045,17 +744,314 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", + "execution_count": 20, + "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-08 17:20:30,917 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. 
This may result in errors or unusable data.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amount_avg_1dtransaction_categoryreceiver_idsender_idamounttimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_id
0833.26Uncategorized45185519044999194333582346477646833.2620241512352416.630
1596.63Uncategorized45185519044999194642413144038776596.632023129103052298.315
2176.76Uncategorized42745440229395224952665515556751176.76202312191165288.380
3879.78Uncategorized45185519044999194457298962882528879.7820242485139439.890
4742.25Uncategorized46018532461252204578126462896710742.25202413082736371.125
.......................................
99992205.43Pension and insurances44050083552203244583355906735225205.4320242155113102.715
99993151.49Pension and insurances43004167445113354949240916846171151.4920241191273875.745
99994188.28Pension and insurances44050083552203244996896020767264188.2820241312283094.140
99995204.26Pension and insurances42620471944990064017367486513464204.262023121216227102.130
99996207.92Pension and insurances46275166741447044250420705087194207.92202428171920103.960
\n", + "

99997 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " amount_avg_1d transaction_category receiver_id \\\n", + "transaction_id \n", + "0 833.26 Uncategorized 4518551904499919 \n", + "1 596.63 Uncategorized 4518551904499919 \n", + "2 176.76 Uncategorized 4274544022939522 \n", + "3 879.78 Uncategorized 4518551904499919 \n", + "4 742.25 Uncategorized 4601853246125220 \n", + "... ... ... ... \n", + "99992 205.43 Pension and insurances 4405008355220324 \n", + "99993 151.49 Pension and insurances 4300416744511335 \n", + "99994 188.28 Pension and insurances 4405008355220324 \n", + "99995 204.26 Pension and insurances 4262047194499006 \n", + "99996 207.92 Pension and insurances 4627516674144704 \n", + "\n", + " sender_id amount timestamp_year timestamp_month \\\n", + "transaction_id \n", + "0 4333582346477646 833.26 2024 1 \n", + "1 4642413144038776 596.63 2023 12 \n", + "2 4952665515556751 176.76 2023 12 \n", + "3 4457298962882528 879.78 2024 2 \n", + "4 4578126462896710 742.25 2024 1 \n", + "... ... ... ... ... \n", + "99992 4583355906735225 205.43 2024 2 \n", + "99993 4949240916846171 151.49 2024 1 \n", + "99994 4996896020767264 188.28 2024 1 \n", + "99995 4017367486513464 204.26 2023 12 \n", + "99996 4250420705087194 207.92 2024 2 \n", + "\n", + " timestamp_day timestamp_hour timestamp_minute \\\n", + "transaction_id \n", + "0 5 12 35 \n", + "1 9 10 30 \n", + "2 19 11 6 \n", + "3 4 8 51 \n", + "4 30 8 27 \n", + "... ... ... ... \n", + "99992 15 5 1 \n", + "99993 19 12 7 \n", + "99994 3 12 28 \n", + "99995 12 16 2 \n", + "99996 8 17 19 \n", + "\n", + " timestamp_second distance \n", + "transaction_id \n", + "0 2 416.630 \n", + "1 52 298.315 \n", + "2 52 88.380 \n", + "3 39 439.890 \n", + "4 36 371.125 \n", + "... ... ... 
\n", + "99992 13 102.715 \n", + "99993 38 75.745 \n", + "99994 30 94.140 \n", + "99995 27 102.130 \n", + "99996 20 103.960 \n", + "\n", + "[99997 rows x 12 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ingested_data = extended_transactions_set.ingest(data, overwrite=True)\n", + "ingested_data" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6595564d-91a2-49c0-93e1-dc8ebb28467d", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-07 16:34:42,430 [warning] Overriding type of entity 'transaction_id' from 'str' to 'float'. This may result in errors or unusable data.\n" - ] - }, { "data": { "text/html": [ @@ -1077,11 +1073,11 @@ " \n", " \n", " \n", + " transaction_category\n", " amount_avg_1d\n", " receiver_id\n", " sender_id\n", " amount\n", - " transaction_category_key\n", " timestamp_year\n", " timestamp_month\n", " timestamp_day\n", @@ -1108,79 +1104,79 @@ " \n", " \n", " \n", - " 106.0\n", - " 879.39\n", - " 4.601853e+15\n", - " 4.274416e+15\n", - " 879.39\n", - " 0\n", - " 2021\n", - " 1\n", + " 0\n", + " Uncategorized\n", + " 833.26\n", + " 4518551904499919\n", + " 4333582346477646\n", + " 833.26\n", + " 2024\n", " 1\n", - " 15\n", - " 7\n", - " 52\n", - " 439.695\n", + " 5\n", + " 12\n", + " 35\n", + " 2\n", + " 416.630\n", " \n", " \n", - " 378.0\n", - " 628.01\n", - " 4.274544e+15\n", - " 4.366884e+15\n", - " 628.01\n", - " 0\n", - " 2021\n", - " 1\n", - " 1\n", - " 16\n", - " 33\n", - " 53\n", - " 314.005\n", + " 1\n", + " Uncategorized\n", + " 596.63\n", + " 4518551904499919\n", + " 4642413144038776\n", + " 596.63\n", + " 2023\n", + " 12\n", + " 9\n", + " 10\n", + " 30\n", + " 52\n", + " 298.315\n", " \n", " \n", - " 368.0\n", - " 89.69\n", - " 4.601853e+15\n", - " 4.161674e+15\n", - " 89.69\n", - " 0\n", - " 2021\n", - " 1\n", - " 1\n", - " 18\n", - " 17\n", - " 29\n", - " 44.845\n", + " 2\n", + " 
Uncategorized\n", + " 176.76\n", + " 4274544022939522\n", + " 4952665515556751\n", + " 176.76\n", + " 2023\n", + " 12\n", + " 19\n", + " 11\n", + " 6\n", + " 52\n", + " 88.380\n", " \n", " \n", - " 17.0\n", - " 222.01\n", - " 4.518552e+15\n", - " 4.619387e+15\n", - " 222.01\n", - " 0\n", - " 2021\n", - " 1\n", - " 1\n", - " 18\n", - " 33\n", - " 18\n", - " 111.005\n", + " 3\n", + " Uncategorized\n", + " 879.78\n", + " 4518551904499919\n", + " 4457298962882528\n", + " 879.78\n", + " 2024\n", + " 2\n", + " 4\n", + " 8\n", + " 51\n", + " 39\n", + " 439.890\n", " \n", " \n", - " 178.0\n", - " 418.52\n", - " 4.274544e+15\n", - " 4.456440e+15\n", - " 418.52\n", - " 0\n", - " 2021\n", - " 1\n", + " 4\n", + " Uncategorized\n", + " 742.25\n", + " 4601853246125220\n", + " 4578126462896710\n", + " 742.25\n", + " 2024\n", " 1\n", - " 19\n", - " 33\n", - " 31\n", - " 209.260\n", + " 30\n", + " 8\n", + " 27\n", + " 36\n", + " 371.125\n", " \n", " \n", " ...\n", @@ -1198,196 +1194,221 @@ " ...\n", " \n", " \n", - " 7192.0\n", - " 45.85\n", - " 4.957788e+15\n", - " 4.353876e+15\n", - " 45.85\n", + " 99992\n", + " Pension and insurances\n", + " 205.43\n", + " 4405008355220324\n", + " 4583355906735225\n", + " 205.43\n", + " 2024\n", + " 2\n", + " 15\n", + " 5\n", " 1\n", - " 2021\n", - " 3\n", - " 20\n", - " 19\n", - " 45\n", - " 4\n", - " 22.925\n", + " 13\n", + " 102.715\n", " \n", " \n", - " 2935.0\n", - " 57.44\n", - " 4.116469e+15\n", - " 4.801157e+15\n", - " 57.44\n", + " 99993\n", + " Pension and insurances\n", + " 151.49\n", + " 4300416744511335\n", + " 4949240916846171\n", + " 151.49\n", + " 2024\n", " 1\n", - " 2021\n", - " 3\n", - " 20\n", " 19\n", - " 50\n", - " 49\n", - " 28.720\n", + " 12\n", + " 7\n", + " 38\n", + " 75.745\n", " \n", " \n", - " 500.0\n", - " 12.37\n", - " 4.952788e+15\n", - " 4.081148e+15\n", - " 12.37\n", + " 99994\n", + " Pension and insurances\n", + " 188.28\n", + " 4405008355220324\n", + " 4996896020767264\n", + " 188.28\n", + " 2024\n", " 1\n", - 
" 2021\n", " 3\n", - " 20\n", - " 19\n", - " 52\n", - " 54\n", - " 6.185\n", + " 12\n", + " 28\n", + " 30\n", + " 94.140\n", " \n", " \n", - " 1605.0\n", - " 66.62\n", - " 4.328906e+15\n", - " 4.795968e+15\n", - " 66.62\n", - " 1\n", - " 2021\n", - " 3\n", - " 20\n", - " 19\n", - " 53\n", - " 24\n", - " 33.310\n", + " 99995\n", + " Pension and insurances\n", + " 204.26\n", + " 4262047194499006\n", + " 4017367486513464\n", + " 204.26\n", + " 2023\n", + " 12\n", + " 12\n", + " 16\n", + " 2\n", + " 27\n", + " 102.130\n", " \n", " \n", - " 2480.0\n", - " 81.38\n", - " 4.903953e+15\n", - " 4.042474e+15\n", - " 81.38\n", - " 1\n", - " 2021\n", - " 3\n", - " 20\n", + " 99996\n", + " Pension and insurances\n", + " 207.92\n", + " 4627516674144704\n", + " 4250420705087194\n", + " 207.92\n", + " 2024\n", + " 2\n", + " 8\n", + " 17\n", " 19\n", - " 56\n", - " 48\n", - " 40.690\n", + " 20\n", + " 103.960\n", " \n", " \n", "\n", - "

9999 rows × 12 columns

\n", + "

99997 rows × 12 columns

\n", "" ], "text/plain": [ - " amount_avg_1d receiver_id sender_id amount \\\n", - "transaction_id \n", - "106.0 879.39 4.601853e+15 4.274416e+15 879.39 \n", - "378.0 628.01 4.274544e+15 4.366884e+15 628.01 \n", - "368.0 89.69 4.601853e+15 4.161674e+15 89.69 \n", - "17.0 222.01 4.518552e+15 4.619387e+15 222.01 \n", - "178.0 418.52 4.274544e+15 4.456440e+15 418.52 \n", - "... ... ... ... ... \n", - "7192.0 45.85 4.957788e+15 4.353876e+15 45.85 \n", - "2935.0 57.44 4.116469e+15 4.801157e+15 57.44 \n", - "500.0 12.37 4.952788e+15 4.081148e+15 12.37 \n", - "1605.0 66.62 4.328906e+15 4.795968e+15 66.62 \n", - "2480.0 81.38 4.903953e+15 4.042474e+15 81.38 \n", + " transaction_category amount_avg_1d receiver_id \\\n", + "transaction_id \n", + "0 Uncategorized 833.26 4518551904499919 \n", + "1 Uncategorized 596.63 4518551904499919 \n", + "2 Uncategorized 176.76 4274544022939522 \n", + "3 Uncategorized 879.78 4518551904499919 \n", + "4 Uncategorized 742.25 4601853246125220 \n", + "... ... ... ... \n", + "99992 Pension and insurances 205.43 4405008355220324 \n", + "99993 Pension and insurances 151.49 4300416744511335 \n", + "99994 Pension and insurances 188.28 4405008355220324 \n", + "99995 Pension and insurances 204.26 4262047194499006 \n", + "99996 Pension and insurances 207.92 4627516674144704 \n", "\n", - " transaction_category_key timestamp_year timestamp_month \\\n", - "transaction_id \n", - "106.0 0 2021 1 \n", - "378.0 0 2021 1 \n", - "368.0 0 2021 1 \n", - "17.0 0 2021 1 \n", - "178.0 0 2021 1 \n", - "... ... ... ... \n", - "7192.0 1 2021 3 \n", - "2935.0 1 2021 3 \n", - "500.0 1 2021 3 \n", - "1605.0 1 2021 3 \n", - "2480.0 1 2021 3 \n", + " sender_id amount timestamp_year timestamp_month \\\n", + "transaction_id \n", + "0 4333582346477646 833.26 2024 1 \n", + "1 4642413144038776 596.63 2023 12 \n", + "2 4952665515556751 176.76 2023 12 \n", + "3 4457298962882528 879.78 2024 2 \n", + "4 4578126462896710 742.25 2024 1 \n", + "... ... ... ... ... 
\n", + "99992 4583355906735225 205.43 2024 2 \n", + "99993 4949240916846171 151.49 2024 1 \n", + "99994 4996896020767264 188.28 2024 1 \n", + "99995 4017367486513464 204.26 2023 12 \n", + "99996 4250420705087194 207.92 2024 2 \n", "\n", " timestamp_day timestamp_hour timestamp_minute \\\n", "transaction_id \n", - "106.0 1 15 7 \n", - "378.0 1 16 33 \n", - "368.0 1 18 17 \n", - "17.0 1 18 33 \n", - "178.0 1 19 33 \n", + "0 5 12 35 \n", + "1 9 10 30 \n", + "2 19 11 6 \n", + "3 4 8 51 \n", + "4 30 8 27 \n", "... ... ... ... \n", - "7192.0 20 19 45 \n", - "2935.0 20 19 50 \n", - "500.0 20 19 52 \n", - "1605.0 20 19 53 \n", - "2480.0 20 19 56 \n", + "99992 15 5 1 \n", + "99993 19 12 7 \n", + "99994 3 12 28 \n", + "99995 12 16 2 \n", + "99996 8 17 19 \n", "\n", " timestamp_second distance \n", "transaction_id \n", - "106.0 52 439.695 \n", - "378.0 53 314.005 \n", - "368.0 29 44.845 \n", - "17.0 18 111.005 \n", - "178.0 31 209.260 \n", + "0 2 416.630 \n", + "1 52 298.315 \n", + "2 52 88.380 \n", + "3 39 439.890 \n", + "4 36 371.125 \n", "... ... ... 
\n", - "7192.0 4 22.925 \n", - "2935.0 49 28.720 \n", - "500.0 54 6.185 \n", - "1605.0 24 33.310 \n", - "2480.0 48 40.690 \n", + "99992 13 102.715 \n", + "99993 38 75.745 \n", + "99994 30 94.140 \n", + "99995 27 102.130 \n", + "99996 20 103.960 \n", "\n", - "[9999 rows x 12 columns]" + "[99997 rows x 12 columns]" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ingested_data = extended_transactions_set.ingest(part_data, overwrite=True)\n", - "ingested_data" - ] - }, - { - "cell_type": "markdown", - "id": "b5e4834e", - "metadata": {}, - "source": [ - "We update the values in the feature store with the real values of our data" - ] - }, - { - "cell_type": "markdown", - "id": "e2f6395f", - "metadata": {}, - "source": [ - "And display them after getting them from the feature store" + "#data = ingested_data.reset_index(drop=True)\n", + "data = ingested_data\n", + "data = data[['transaction_category'] + [col for col in data.columns if col != 'transaction_category']]\n", + "data" ] }, { - "cell_type": "markdown", - "id": "cf148985", + "cell_type": "code", + "execution_count": 22, + "id": "e1d377a5-cf7e-4564-8e14-10bfbaca4da2", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['transaction_category',\n", + " 'amount_avg_1d',\n", + " 'receiver_id',\n", + " 'sender_id',\n", + " 'amount',\n", + " 'timestamp_year',\n", + " 'timestamp_month',\n", + " 'timestamp_day',\n", + " 'timestamp_hour',\n", + " 'timestamp_minute',\n", + " 'timestamp_second',\n", + " 'distance']" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "We use the feature store to calculate the distance between the average of every category and the current amount" + "data_cols = list(data.columns)\n", + "data_cols" ] }, { - "cell_type": "markdown", - "id": "289eeca6", + "cell_type": "code", + "execution_count": 23, + "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", 
 "metadata": {}, + "outputs": [], "source": [ - "### 4. Create model \n", - "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", + "# Import MLRun's Feature Store\n", + "import mlrun.feature_store as fstore\n", "\n", + "# create feature vector on top of aggregations\n", + "# Define the list of features we will be using\n", + "features = [f\"transactions.{name}\" for name in data_cols] \n", "\n", + "# Define the feature vector name for future reference\n", + "fv_name = 'transactions-vector'\n", "\n", - "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." 
+ "# Define the feature vector using our Feature Store (fstore)\n", + "transactions_fv = fstore.FeatureVector(fv_name, \n", + " features, \n", + " description='stocks information')\n", + "\n", + "#label_feature = 'transactions-v2.transaction_category',\n", + "# Save the feature vector in the Feature Store\n", + "transactions_fv.save()" ] }, { "cell_type": "code", - "execution_count": 48, - "id": "bb4bdd8d", + "execution_count": 29, + "id": "7737865e-21a1-4bfe-b24a-29925145280f", "metadata": {}, "outputs": [ { @@ -1411,7 +1432,7 @@ " \n", " \n", " \n", - " transaction_category_key\n", + " transaction_category\n", " amount_avg_1d\n", " receiver_id\n", " sender_id\n", @@ -1428,78 +1449,78 @@ " \n", " \n", " 0\n", - " 0\n", - " 879.39\n", - " 4.601853e+15\n", - " 4.274416e+15\n", - " 879.39\n", - " 2021\n", - " 1\n", + " Uncategorized\n", + " 833.26\n", + " 4518551904499919\n", + " 4333582346477646\n", + " 833.26\n", + " 2024\n", " 1\n", - " 15\n", - " 7\n", - " 52\n", - " 439.695\n", + " 5\n", + " 12\n", + " 35\n", + " 2\n", + " 416.630\n", " \n", " \n", " 1\n", - " 0\n", - " 628.01\n", - " 4.274544e+15\n", - " 4.366884e+15\n", - " 628.01\n", - " 2021\n", - " 1\n", - " 1\n", - " 16\n", - " 33\n", - " 53\n", - " 314.005\n", + " Uncategorized\n", + " 596.63\n", + " 4518551904499919\n", + " 4642413144038776\n", + " 596.63\n", + " 2023\n", + " 12\n", + " 9\n", + " 10\n", + " 30\n", + " 52\n", + " 298.315\n", " \n", " \n", " 2\n", - " 0\n", - " 89.69\n", - " 4.601853e+15\n", - " 4.161674e+15\n", - " 89.69\n", - " 2021\n", - " 1\n", - " 1\n", - " 18\n", - " 17\n", - " 29\n", - " 44.845\n", + " Uncategorized\n", + " 176.76\n", + " 4274544022939522\n", + " 4952665515556751\n", + " 176.76\n", + " 2023\n", + " 12\n", + " 19\n", + " 11\n", + " 6\n", + " 52\n", + " 88.380\n", " \n", " \n", " 3\n", - " 0\n", - " 222.01\n", - " 4.518552e+15\n", - " 4.619387e+15\n", - " 222.01\n", - " 2021\n", - " 1\n", - " 1\n", - " 18\n", - " 33\n", - " 18\n", - " 111.005\n", + " Uncategorized\n", 
+ " 879.78\n", + " 4518551904499919\n", + " 4457298962882528\n", + " 879.78\n", + " 2024\n", + " 2\n", + " 4\n", + " 8\n", + " 51\n", + " 39\n", + " 439.890\n", " \n", " \n", " 4\n", - " 0\n", - " 418.52\n", - " 4.274544e+15\n", - " 4.456440e+15\n", - " 418.52\n", - " 2021\n", - " 1\n", + " Uncategorized\n", + " 742.25\n", + " 4601853246125220\n", + " 4578126462896710\n", + " 742.25\n", + " 2024\n", " 1\n", - " 19\n", - " 33\n", - " 31\n", - " 209.260\n", + " 30\n", + " 8\n", + " 27\n", + " 36\n", + " 371.125\n", " \n", " \n", " ...\n", @@ -1517,150 +1538,205 @@ " ...\n", " \n", " \n", - " 9994\n", + " 99992\n", + " Pension and insurances\n", + " 205.43\n", + " 4405008355220324\n", + " 4583355906735225\n", + " 205.43\n", + " 2024\n", + " 2\n", + " 15\n", + " 5\n", " 1\n", - " 45.85\n", - " 4.957788e+15\n", - " 4.353876e+15\n", - " 45.85\n", - " 2021\n", - " 3\n", - " 20\n", - " 19\n", - " 45\n", - " 4\n", - " 22.925\n", + " 13\n", + " 102.715\n", " \n", " \n", - " 9995\n", + " 99993\n", + " Pension and insurances\n", + " 151.49\n", + " 4300416744511335\n", + " 4949240916846171\n", + " 151.49\n", + " 2024\n", " 1\n", - " 57.44\n", - " 4.116469e+15\n", - " 4.801157e+15\n", - " 57.44\n", - " 2021\n", - " 3\n", - " 20\n", " 19\n", - " 50\n", - " 49\n", - " 28.720\n", + " 12\n", + " 7\n", + " 38\n", + " 75.745\n", " \n", " \n", - " 9996\n", + " 99994\n", + " Pension and insurances\n", + " 188.28\n", + " 4405008355220324\n", + " 4996896020767264\n", + " 188.28\n", + " 2024\n", " 1\n", - " 12.37\n", - " 4.952788e+15\n", - " 4.081148e+15\n", - " 12.37\n", - " 2021\n", " 3\n", - " 20\n", - " 19\n", - " 52\n", - " 54\n", - " 6.185\n", + " 12\n", + " 28\n", + " 30\n", + " 94.140\n", " \n", " \n", - " 9997\n", - " 1\n", - " 66.62\n", - " 4.328906e+15\n", - " 4.795968e+15\n", - " 66.62\n", - " 2021\n", - " 3\n", - " 20\n", - " 19\n", - " 53\n", - " 24\n", - " 33.310\n", + " 99995\n", + " Pension and insurances\n", + " 204.26\n", + " 4262047194499006\n", + " 4017367486513464\n", 
+ " 204.26\n", + " 2023\n", + " 12\n", + " 12\n", + " 16\n", + " 2\n", + " 27\n", + " 102.130\n", " \n", " \n", - " 9998\n", - " 1\n", - " 81.38\n", - " 4.903953e+15\n", - " 4.042474e+15\n", - " 81.38\n", - " 2021\n", - " 3\n", - " 20\n", + " 99996\n", + " Pension and insurances\n", + " 207.92\n", + " 4627516674144704\n", + " 4250420705087194\n", + " 207.92\n", + " 2024\n", + " 2\n", + " 8\n", + " 17\n", " 19\n", - " 56\n", - " 48\n", - " 40.690\n", + " 20\n", + " 103.960\n", " \n", " \n", "\n", - "

9999 rows × 12 columns

\n", + "

99997 rows × 12 columns

\n", "" ], "text/plain": [ - " transaction_category_key amount_avg_1d receiver_id sender_id \\\n", - "0 0 879.39 4.601853e+15 4.274416e+15 \n", - "1 0 628.01 4.274544e+15 4.366884e+15 \n", - "2 0 89.69 4.601853e+15 4.161674e+15 \n", - "3 0 222.01 4.518552e+15 4.619387e+15 \n", - "4 0 418.52 4.274544e+15 4.456440e+15 \n", - "... ... ... ... ... \n", - "9994 1 45.85 4.957788e+15 4.353876e+15 \n", - "9995 1 57.44 4.116469e+15 4.801157e+15 \n", - "9996 1 12.37 4.952788e+15 4.081148e+15 \n", - "9997 1 66.62 4.328906e+15 4.795968e+15 \n", - "9998 1 81.38 4.903953e+15 4.042474e+15 \n", + " transaction_category amount_avg_1d receiver_id \\\n", + "0 Uncategorized 833.26 4518551904499919 \n", + "1 Uncategorized 596.63 4518551904499919 \n", + "2 Uncategorized 176.76 4274544022939522 \n", + "3 Uncategorized 879.78 4518551904499919 \n", + "4 Uncategorized 742.25 4601853246125220 \n", + "... ... ... ... \n", + "99992 Pension and insurances 205.43 4405008355220324 \n", + "99993 Pension and insurances 151.49 4300416744511335 \n", + "99994 Pension and insurances 188.28 4405008355220324 \n", + "99995 Pension and insurances 204.26 4262047194499006 \n", + "99996 Pension and insurances 207.92 4627516674144704 \n", "\n", - " amount timestamp_year timestamp_month timestamp_day timestamp_hour \\\n", - "0 879.39 2021 1 1 15 \n", - "1 628.01 2021 1 1 16 \n", - "2 89.69 2021 1 1 18 \n", - "3 222.01 2021 1 1 18 \n", - "4 418.52 2021 1 1 19 \n", - "... ... ... ... ... ... \n", - "9994 45.85 2021 3 20 19 \n", - "9995 57.44 2021 3 20 19 \n", - "9996 12.37 2021 3 20 19 \n", - "9997 66.62 2021 3 20 19 \n", - "9998 81.38 2021 3 20 19 \n", + " sender_id amount timestamp_year timestamp_month \\\n", + "0 4333582346477646 833.26 2024 1 \n", + "1 4642413144038776 596.63 2023 12 \n", + "2 4952665515556751 176.76 2023 12 \n", + "3 4457298962882528 879.78 2024 2 \n", + "4 4578126462896710 742.25 2024 1 \n", + "... ... ... ... ... 
\n", + "99992 4583355906735225 205.43 2024 2 \n", + "99993 4949240916846171 151.49 2024 1 \n", + "99994 4996896020767264 188.28 2024 1 \n", + "99995 4017367486513464 204.26 2023 12 \n", + "99996 4250420705087194 207.92 2024 2 \n", "\n", - " timestamp_minute timestamp_second distance \n", - "0 7 52 439.695 \n", - "1 33 53 314.005 \n", - "2 17 29 44.845 \n", - "3 33 18 111.005 \n", - "4 33 31 209.260 \n", - "... ... ... ... \n", - "9994 45 4 22.925 \n", - "9995 50 49 28.720 \n", - "9996 52 54 6.185 \n", - "9997 53 24 33.310 \n", - "9998 56 48 40.690 \n", + " timestamp_day timestamp_hour timestamp_minute timestamp_second \\\n", + "0 5 12 35 2 \n", + "1 9 10 30 52 \n", + "2 19 11 6 52 \n", + "3 4 8 51 39 \n", + "4 30 8 27 36 \n", + "... ... ... ... ... \n", + "99992 15 5 1 13 \n", + "99993 19 12 7 38 \n", + "99994 3 12 28 30 \n", + "99995 12 16 2 27 \n", + "99996 8 17 19 20 \n", "\n", - "[9999 rows x 12 columns]" + " distance \n", + "0 416.630 \n", + "1 298.315 \n", + "2 88.380 \n", + "3 439.890 \n", + "4 371.125 \n", + "... ... 
\n", + "99992 102.715 \n", + "99993 75.745 \n", + "99994 94.140 \n", + "99995 102.130 \n", + "99996 103.960 \n", + "\n", + "[99997 rows x 12 columns]" ] }, - "execution_count": 48, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = ingested_data.reset_index(drop=True)\n", - "data = data[['transaction_category_key'] + [col for col in data.columns if col != 'transaction_category_key']]\n", + "import mlrun.feature_store as fs\n", + "resp = fs.FeatureVector.get_offline_features(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector:latest\")\n", + "#Preview the dataset\n", + "fv_data = resp.to_dataframe()\n", + "fv_data\n", "\n", - "data" + "# svc = fs.FeatureVector.get_online_feature_service(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector:latest\")\n", + "# resp = svc.get([{\"transaction_id\": \"99996\"}])\n", + "# resp" + ] + }, + { + "cell_type": "markdown", + "id": "b5e4834e", + "metadata": {}, + "source": [ + "We update the values in the feature store with the real values of our data" + ] + }, + { + "cell_type": "markdown", + "id": "e2f6395f", + "metadata": {}, + "source": [ + "And display them after getting them from the feature store" + ] + }, + { + "cell_type": "markdown", + "id": "cf148985", + "metadata": {}, + "source": [ + "We use the feature store to calculate the distance between the average of every category and the current amount" + ] + }, + { + "cell_type": "markdown", + "id": "289eeca6", + "metadata": {}, + "source": [ + "### 4. Create model \n", + "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. 
The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", + "\n", + "\n", + "\n", + "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 30, "id": "47512de3-60ac-49c7-ace8-031959527e86", "metadata": {}, "outputs": [], "source": [ "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", "train_data, validation_data, test_data = np.split(\n", - " data.sample(frac=1, random_state=42), [int(0.7 * len(data)), int(0.9 * len(data))]\n", + " fv_data.sample(frac=1, random_state=42), [int(0.7 * len(fv_data)), int(0.9 * len(fv_data))]\n", ")" ] }, @@ -1674,7 +1750,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 31, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -1694,20 +1770,10 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 32, "id": "e1ca2543", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:botocore.credentials:Found credentials in environment variables.\n", - "INFO:botocore.credentials:Found credentials in environment variables.\n", - "INFO:botocore.credentials:Found credentials in environment variables.\n" - ] - } - ], + "outputs": [], "source": [ "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", " os.path.join(bucket_prefix, \"train/train.csv\")\n", @@ -1730,18 +1796,10 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 33, "id": "a41b6a7d", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n" - ] - } - ], + "outputs": 
[], "source": [ "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" ] @@ -1756,7 +1814,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 34, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -1779,7 +1837,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 35, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -1804,7 +1862,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 36, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -1832,7 +1890,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 37, "id": "c24e06fc", "metadata": { "scrolled": true @@ -1842,143 +1900,143 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-07-17-07-42-480\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-08-17-31-38-814\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-02-07 17:07:42 Starting - Starting the training job......\n", - "2024-02-07 17:08:18 Starting - Preparing the instances for training......\n", - "2024-02-07 17:09:17 Downloading - Downloading input data...\n", - "2024-02-07 17:09:47 Downloading - Downloading the training image...\n", - "2024-02-07 17:10:38 Training - Training image download completed. 
Training in progress...\n", - "2024-02-07 17:11:08 Uploading - Uploading generated training model\u001b[34m[2024-02-07 17:10:55.079 ip-10-0-147-106.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-08 17:31:38 Starting - Starting the training job...\n", + "2024-02-08 17:32:02 Starting - Preparing the instances for training.........\n", + "2024-02-08 17:33:23 Downloading - Downloading input data...\n", + "2024-02-08 17:33:53 Downloading - Downloading the training image......\n", + "2024-02-08 17:34:43 Training - Training image download completed. Training in progress.\u001b[34m[2024-02-08 17:35:00.315 ip-10-0-178-178.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Train matrix has 
6999 rows and 11 columns\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Validation matrix has 2000 rows\u001b[0m\n", - "\u001b[34m[2024-02-07 17:10:55.166 ip-10-0-147-106.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-07 17:10:55.166 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-07 17:10:55.167 ip-10-0-147-106.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-07 17:10:55.168 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-07 17:10:55.168 ip-10-0-147-106.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-07:17:10:55:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[2024-02-07 17:10:55.289 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-07 17:10:55.291 ip-10-0-147-106.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - 
"\u001b[34m[6]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[12]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.00700#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.00686#011validation-merror:0.00700\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - 
"\u001b[34m[29]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[35]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.00657#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.00643#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.00629#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.00629#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.00629#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - 
"\u001b[34m[52]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.00614#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[58]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.00586#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.00571#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - 
"\u001b[34m[75]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.00557#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[81]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.00529#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.00543#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", - 
"\u001b[34m[98]#011train-merror:0.00514#011validation-merror:0.00650\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.00486#011validation-merror:0.00650\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Train matrix has 69997 rows and 11 columns\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Validation matrix has 20000 rows\u001b[0m\n", + "\u001b[34m[2024-02-08 17:35:00.471 ip-10-0-178-178.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-08 17:35:00.472 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-08 17:35:00.472 ip-10-0-178-178.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-08 17:35:00.473 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-08 17:35:00.473 ip-10-0-178-178.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-08:17:35:00:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[2024-02-08 17:35:00.749 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-08 17:35:00.752 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[3]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + 
"\u001b[34m[12]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[26]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + 
"\u001b[34m[35]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[49]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + 
"\u001b[34m[58]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + 
"\u001b[34m[81]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", "\n", - "2024-02-07 17:11:19 Completed - Training job completed\n", - "Training seconds: 122\n", - "Billable seconds: 122\n" + "2024-02-08 17:35:29 Uploading - Uploading generated training model\n", + "2024-02-08 17:35:45 Completed - Training job completed\n", + "Training seconds: 142\n", + "Billable seconds: 142\n" ] } ], @@ -1998,17 +2056,17 @@ }, { "cell_type": "code", - "execution_count": 57, + 
"execution_count": 38, "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-07-17-07-42-480/output/model.tar.gz'" + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-08-17-31-38-814/output/model.tar.gz'" ] }, - "execution_count": 57, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2019,7 +2077,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 39, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": [], @@ -2029,7 +2087,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 40, "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", "metadata": {}, "outputs": [ @@ -2042,61 +2100,49 @@ "\n", "\n", - "\n", + "\n", "\n", "mlrun-flow\n", - "\n", + "\n", "\n", "\n", "_start\n", "\n", "start\n", "\n", - "\n", - "\n", - "preprocess\n", - "\n", - "preprocess\n", - "\n", - "\n", - "\n", - "_start->preprocess\n", - "\n", - "\n", - "\n", "\n", - "\n", + "\n", "xgboost-model\n", - "\n", - "xgboost-model\n", + "\n", + "xgboost-model\n", "\n", - "\n", - "\n", - "preprocess->xgboost-model\n", - "\n", - "\n", + "\n", + "\n", + "_start->xgboost-model\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "postprocess\n", - "\n", - "postprocess\n", + "\n", + "postprocess\n", "\n", "\n", - "\n", + "\n", "xgboost-model->postprocess\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 59, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2106,8 +2152,7 @@ "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", "\n", "# Add the steps:\n", - "graph.to(handler=\"preprocess\", name=\"preprocess\") \\\n", - " .to(\"XGBModelServer\",\n", + "graph.to(\"XGBModelServer\",\n", " name=\"xgboost-model\",\n", " 
model_path=xgb.model_data) \\\n", " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", @@ -2118,124 +2163,10 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-07 17:13:49,933 [info] Starting remote function deploy\n", - "2024-02-07 17:13:50 (info) Deploying function\n", - "2024-02-07 17:13:50 (info) Building\n", - "2024-02-07 17:13:50 (info) Staging files and preparing base images\n", - "2024-02-07 17:13:50 (info) Building processor image\n", - "2024-02-07 17:15:36 (info) Build complete\n", - "Failed to deploy. Details:\n", - "Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", - " self.load()\n", - " File \"/opt/nuclio/serving.py\", line 21, in load\n", - " model_file, extra_data = self.get_model(\".tar.gz\")\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", - " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", - " obj.download(temp_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", - " self._store.download(self._path, target_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", - " data = self.get(key)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", - " return obj.get()[\"Body\"].read()\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", - " response = action(self, *args, **kwargs)\n", - " File 
\"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", - " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", - " return self._make_api_call(operation_name, kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", - " raise error_class(parsed_response, operation_name)\n", - "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", - " [worker_id=0]\n", - "Exception raised while running init_context [worker_id=0]\n", - "Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", - " self.load()\n", - " File \"/opt/nuclio/serving.py\", line 21, in load\n", - " model_file, extra_data = self.get_model(\".tar.gz\")\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", - " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", - " obj.download(temp_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", - " self._store.download(self._path, target_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", - " data = self.get(key)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", - " return obj.get()[\"Body\"].read()\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", - " response = action(self, *args, **kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", 
line 88, in __call__\n", - " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", - " return self._make_api_call(operation_name, kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", - " raise error_class(parsed_response, operation_name)\n", - "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", - "\n", - "The above exception was the direct cause of the following exception:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 480, in \n", - " run_wrapper()\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 468, in run_wrapper\n", - " loop.run_until_complete(wrapper_instance.initialize())\n", - " File \"/opt/conda/lib/python3.9/asyncio/base_events.py\", line 647, in run_until_complete\n", - " return future.result()\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 165, in initialize\n", - " await self._initialize_context()\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 188, in _initialize_context\n", - " init_context_result = getattr(self._entrypoint_module, 'init_context')(self._context)\n", - " File \"/opt/nuclio/serving.py\", line 135, in init_context\n", - " nuclio_init_hook(context, globals(), 'serving_v2')\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/runtimes/nuclio.py\", line 34, in nuclio_init_hook\n", - " v2_serving_init(context, data)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 349, in v2_serving_init\n", - " serving_handler = server.init_object(namespace or get_caller_globals())\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 192, in init_object\n", - " self.graph.init_object(self.context, namespace, self.load_mode, reset=True)\n", - " File 
\"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 917, in init_object\n", - " step.init_object(context, namespace, mode, reset=reset)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 444, in init_object\n", - " self._post_init(mode)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 502, in _post_init\n", - " self._object.post_init(mode)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 143, in post_init\n", - " self._load_and_update_state()\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 131, in _load_and_update_state\n", - " raise RuntimeError(f\"failed to load model {self.name}\") from exc\n", - "RuntimeError: failed to load model xgboost-model\n", - "> 2024-02-07 17:15:51,622 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" - ] - }, - { - "ename": "RunError", - "evalue": "Function serving deployment failed", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[60], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/project.py:3188\u001b[0m, in \u001b[0;36mMlrunProject.deploy_function\u001b[0;34m(self, function, dashboard, models, env, tag, verbose, builder_env, mock)\u001b[0m\n\u001b[1;32m 3166\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeploy_function\u001b[39m(\n\u001b[1;32m 3167\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 
3168\u001b[0m function: typing\u001b[38;5;241m.\u001b[39mUnion[\u001b[38;5;28mstr\u001b[39m, mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mBaseRuntime],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3175\u001b[0m mock: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 3176\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mUnion[DeployStatus, kfp\u001b[38;5;241m.\u001b[39mdsl\u001b[38;5;241m.\u001b[39mContainerOp]:\n\u001b[1;32m 3177\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"deploy real-time (nuclio based) functions\u001b[39;00m\n\u001b[1;32m 3178\u001b[0m \n\u001b[1;32m 3179\u001b[0m \u001b[38;5;124;03m :param function: name of the function (in the project) or function object\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3186\u001b[0m \u001b[38;5;124;03m :param mock: deploy mock server vs a real Nuclio function (for local simulations)\u001b[39;00m\n\u001b[1;32m 3187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3189\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3190\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3192\u001b[0m \u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3194\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3196\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmock\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmock\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/operations.py:395\u001b[0m, in \u001b[0;36mdeploy_function\u001b[0;34m(function, dashboard, models, env, tag, verbose, builder_env, project_object, mock)\u001b[0m\n\u001b[1;32m 388\u001b[0m function\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 390\u001b[0m state\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 391\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname},\n\u001b[1;32m 392\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 393\u001b[0m )\n\u001b[0;32m--> 395\u001b[0m address \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# return object with the same outputs as the KFP op (allow using the same pipeline)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 400\u001b[0m state\u001b[38;5;241m=\u001b[39mfunction\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 401\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: address, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mnuclio_name},\n\u001b[1;32m 402\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 403\u001b[0m )\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:647\u001b[0m, in \u001b[0;36mServingRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_deploy_function_refs()\n\u001b[1;32m 645\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeploy root function 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m ...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 652\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 654\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 655\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:586\u001b[0m, in \u001b[0;36mRemoteRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_credentials_from_remote_build(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 584\u001b[0m \u001b[38;5;66;03m# when a function is deployed, we wait for it to be ready by default\u001b[39;00m\n\u001b[1;32m 585\u001b[0m \u001b[38;5;66;03m# this also means that 
the function object will be updated with the function status\u001b[39;00m\n\u001b[0;32m--> 586\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_function_deployment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[38;5;66;03m# NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort\u001b[39;00m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# now, functions can be not exposed (using service type ClusterIP) and hence\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;66;03m# for BC we first try to populate the external invocation url, and then\u001b[39;00m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;66;03m# if not exists, take the internal invocation url\u001b[39;00m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mexternal_invocation_urls:\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:633\u001b[0m, in \u001b[0;36mRemoteRuntime._wait_for_function_deployment\u001b[0;34m(self, db, verbose)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m state \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 632\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNuclio function failed to deploy\u001b[39m\u001b[38;5;124m\"\u001b[39m, function_state\u001b[38;5;241m=\u001b[39mstate)\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RunError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFunction 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m deployment failed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mRunError\u001b[0m: Function serving deployment failed" - ] - } - ], + "outputs": [], "source": [ "project.deploy_function(\"serving\")" ] diff --git a/serving-Copy1.ipynb b/serving-Copy1.ipynb index 8fcf57d..78ab73d 100644 --- a/serving-Copy1.ipynb +++ b/serving-Copy1.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 24, "id": "1b3d7eb9-b601-47b4-a914-191e5bcf2764", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 25, "id": "41fa803e-cd2c-46ff-ba0c-6ff6d7b0b92c", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 26, "id": "2c7bb858-9603-4c67-92c0-722b0cf24714", "metadata": { "scrolled": true @@ -33,7 +33,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-08 10:46:56,324 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" + "> 2024-02-08 17:39:51,937 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" ] } ], @@ -50,17 +50,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 27, "id": "74dab54c-6348-4a18-9db5-5d8074370fb0", "metadata": {}, "outputs": [], "source": [ - "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-07-17-07-42-480/output/model.tar.gz'" + "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-08-17-31-38-814/output/model.tar.gz'" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 28, "id": "3ee1d9bd-2652-4349-8df5-e231edb6acfa", "metadata": {}, "outputs": [], @@ -74,7 +74,7 @@ }, { 
"cell_type": "code", - "execution_count": 6, + "execution_count": 29, "id": "3da03265-204e-4600-8746-adc81f7ce3bf", "metadata": {}, "outputs": [], @@ -86,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 30, "id": "b004f4de-cab6-47b6-b786-07b0601eac82", "metadata": {}, "outputs": [], @@ -128,79 +128,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "1b81e491-e4d8-4253-8f6d-65b5a8a74640", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "41719e95-2d93-437f-8ace-57963e281d9d", - "metadata": {}, - "outputs": [], - "source": [ - "# Import MLRun's Feature Store\n", - "import mlrun.feature_store as fstore\n", - "\n", - "# create feature vector on top of aggreagations\n", - "# Define the list of features we will be using\n", - "features = [f\"transactions-v13.{name}\" for name in data_cols] \n", - "\n", - "# Define the feature vector name for future reference\n", - "fv_name = 'transactions-vector-v3'\n", - "\n", - "# Define the feature vector using our Feature Store (fstore)\n", - "transactions_fv = fstore.FeatureVector(fv_name, \n", - " features, \n", - " description='stocks information')\n", - "\n", - "#label_feature = 'transactions-v2.transaction_category',\n", - "# Save the feature vector in the Feature Store\n", - "transactions_fv.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "81b398c4-4aa0-4746-8a4c-e8c745027692", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun.feature_store as fs\n", - "# resp = fs.FeatureVector.get_offline_features(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector-v3:latest\")\n", - "# #Preview the dataset\n", - "# resp.to_dataframe().tail(5)\n", - "\n", - "svc = fs.FeatureVector.get_online_feature_service(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector-v3:latest\")\n", - "resp = svc.get([{\"transaction_id\": \"42\"}])" - ] - }, - { - "cell_type": "code", - 
"execution_count": 11, - "id": "04f288c0-11bc-45a1-ba23-c435e1696aa4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[None]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "resp" - ] - }, - { - "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "82817363-e449-4c0b-8527-1642b74c9406", "metadata": {}, "outputs": [], @@ -220,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "id": "e07c6b0b-7c9a-4c44-bb08-ea8a865a043c", "metadata": {}, "outputs": [], @@ -230,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "id": "6a291b9c-0acc-4807-ab8e-4bec180a2bbf", "metadata": {}, "outputs": [ @@ -281,22 +209,24 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 29, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", - "# test_serving_function.set_topology(\n", - "# \"router\",\n", - "# mlrun.serving.routers.EnrichmentModelRouter(\n", - "# feature_vector_uri=\"store://feature-vectors/sagemaker-v3-admin/transactions-vector-v3:latest\",\n", - "# impute_policy={\"*\": \"$mean\"}),\n", - "# )\n", + "graph = test_serving_function.set_topology(\n", + " \"router\",\n", + " mlrun.serving.routers.EnrichmentModelRouter(\n", + " feature_vector_uri=\"store://feature-vectors/sagemaker-v3-admin/transactions-vector:latest\",\n", + " impute_policy={\"*\": \"$mean\"}),\n", + ")\n", + "\n", + "#graph.to(handler=\"postprocess\", name=\"postprocess\").respond()\n", "# # add the 3 trained models to the Ensemble\n", "# for model in project.list_models('', tag='latest'):\n", "# name = model.spec.db_key\n", @@ -310,7 +240,15 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, + "id": "58874abf-f7c8-4f40-83d0-8beb92dfd550", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, "id": 
"fb976023-5a2c-4dc8-b1b7-fd897446b747", "metadata": {}, "outputs": [ @@ -318,8 +256,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-08 11:07:24,154 [info] model xgboost-model was loaded\n", - "> 2024-02-08 11:07:24,155 [info] Loaded ['xgboost-model']\n" + "> 2024-02-08 17:41:19,924 [info] model xgboost-model was loaded\n", + "> 2024-02-08 17:41:19,925 [info] Loaded ['xgboost-model']\n" ] } ], @@ -329,39 +267,29 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 33, "id": "35e98782-129d-4ffb-b27e-d580589d6106", "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "data = pd.read_csv(\n", - " \"test.csv\")" + "# import pandas as pd\n", + "# data = pd.read_csv(\n", + "# \"test.csv\")" ] }, { "cell_type": "code", - "execution_count": 40, - "id": "dfd10537-482a-404b-b765-b0db5cbb497b", - "metadata": {}, - "outputs": [], - "source": [ - "data = data.drop('transaction_category_key', axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, + "execution_count": 34, "id": "8f67e627-cfbf-4b8b-a5b3-7b9e836f779a", "metadata": {}, "outputs": [], "source": [ - "data = data[:1]" + "# data = data[:1]" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 35, "id": "90abdb9d-3140-45eb-9a4d-a83a74b95700", "metadata": {}, "outputs": [ @@ -386,6 +314,7 @@ " \n", " \n", " \n", + " transaction_category\n", " amount_avg_1d\n", " receiver_id\n", " sender_id\n", @@ -402,40 +331,41 @@ " \n", " \n", " 0\n", - " 53.07\n", - " 4.726537e+15\n", - " 4.070479e+15\n", - " 53.07\n", - " 2021\n", - " 2\n", - " 19\n", - " 19\n", - " 37\n", - " 40\n", - " 26.535\n", + " Shopping\n", + " 67.18\n", + " 4630518417004166\n", + " 4898290331783278\n", + " 67.18\n", + " 2023\n", + " 11\n", + " 7\n", + " 16\n", + " 34\n", + " 30\n", + " 33.59\n", " \n", " \n", "\n", "" ], "text/plain": [ - " amount_avg_1d receiver_id sender_id amount timestamp_year \\\n", - "0 53.07 4.726537e+15 4.070479e+15 53.07 2021 \n", + " 
transaction_category amount_avg_1d receiver_id sender_id \\\n", + "0 Shopping 67.18 4630518417004166 4898290331783278 \n", "\n", - " timestamp_month timestamp_day timestamp_hour timestamp_minute \\\n", - "0 2 19 19 37 \n", + " amount timestamp_year timestamp_month timestamp_day timestamp_hour \\\n", + "0 67.18 2023 11 7 16 \n", "\n", - " timestamp_second distance \n", - "0 40 26.535 " + " timestamp_minute timestamp_second distance \n", + "0 34 30 33.59 " ] }, - "execution_count": 42, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data" + "# data" ] }, { @@ -466,8 +396,8 @@ } ], "source": [ - "data_ls = data.values.tolist()\n", - "data_ls" + "# data_ls = data.values.tolist()\n", + "# data_ls" ] }, { @@ -477,12 +407,12 @@ "metadata": {}, "outputs": [], "source": [ - "inputs_data = {'inputs': data_ls }" + "#inputs_data = {'inputs': data_ls }" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 37, "id": "dff40293-9d50-400c-9a1b-62a7e610e176", "metadata": {}, "outputs": [ @@ -490,38 +420,69 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-08 11:14:12,578 [error] run error, Traceback (most recent call last):\n", + "{'inputs': [['Pension and insurances', 207.92, 4627516674144704, 4250420705087194, 207.92, 2024, 2, 8, 17, 19, 20, 103.96]]}\n", + "> 2024-02-08 17:42:48,779 [error] run error, Traceback (most recent call last):\n", " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py\", line 280, in run\n", " response = self.graph.run(event, **(extra_args or {}))\n", " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 548, in run\n", " raise exc\n", " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 531, in run\n", " return self._handler(event, *args, **kwargs)\n", - " File 
\"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 148, in do_event\n", - " event = self.preprocess(event)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 1174, in preprocess\n", - " event.body[\"inputs\"] = self._feature_service.get(\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/feature_store/feature_vector.py\", line 892, in get\n", - " raise mlrun.errors.MLRunInvalidArgumentError(\n", - "mlrun.errors.MLRunInvalidArgumentError: input list must be in the same size of the index_keys list\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 151, in do_event\n", + " event = self.postprocess(self._handle_event(event))\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 209, in _handle_event\n", + " response = route.run(event)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 548, in run\n", + " raise exc\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 531, in run\n", + " return self._handler(event, *args, **kwargs)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 249, in do_event\n", + " raise exc\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 244, in do_event\n", + " outputs = self.predict(request)\n", + " File \"src/functions/serving.py\", line 39, in predict\n", + " data = xgb.DMatrix(data)\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/core.py\", line 730, in inner_f\n", + " return func(**kwargs)\n", + " File 
\"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/core.py\", line 857, in __init__\n", + " handle, feature_names, feature_types = dispatch_data_backend(\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/data.py\", line 1075, in dispatch_data_backend\n", + " return _from_numpy_array(\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/data.py\", line 207, in _from_numpy_array\n", + " _check_call(\n", + " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/core.py\", line 282, in _check_call\n", + " raise XGBoostError(py_str(_LIB.XGBGetLastError()))\n", + "xgboost.core.XGBoostError: [17:42:48] /workspace/src/c_api/../data/array_interface.h:492: Unicode-3 is not supported.\n", + "Stack trace:\n", + " [bt] (0) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x15c2ca) [0x7f406c4212ca]\n", + " [bt] (1) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x19ff29) [0x7f406c464f29]\n", + " [bt] (2) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x16c) [0x7f406c435a6c]\n", + " [bt] (3) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x7f411ac7f052]\n", + " [bt] (4) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x7f411ac7d925]\n", + " [bt] (5) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f411ac7e06e]\n", + " [bt] (6) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x91e0) [0x7f411ac8f1e0]\n", + " [bt] (7) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x8568) [0x7f411ac8e568]\n", + " [bt] (8) 
/home/sagemaker-user/.conda/envs/smdemo/bin/python3.9(_PyObject_MakeTpCall+0x2ec) [0x4f073c]\n", + "\n", + "\n", "\n" ] }, { "ename": "RuntimeError", - "evalue": "failed (400): MLRunInvalidArgumentError: input list must be in the same size of the index_keys list", + "evalue": "failed (400): XGBoostError: [17:42:48] /workspace/src/c_api/../data/array_interface.h:492: Unicode-3 is not supported.\nStack trace:\n [bt] (0) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x15c2ca) [0x7f406c4212ca]\n [bt] (1) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x19ff29) [0x7f406c464f29]\n [bt] (2) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x16c) [0x7f406c435a6c]\n [bt] (3) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x7f411ac7f052]\n [bt] (4) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x7f411ac7d925]\n [bt] (5) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f411ac7e06e]\n [bt] (6) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x91e0) [0x7f411ac8f1e0]\n [bt] (7) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x8568) [0x7f411ac8e568]\n [bt] (8) /home/sagemaker-user/.conda/envs/smdemo/bin/python3.9(_PyObject_MakeTpCall+0x2ec) [0x4f073c]\n\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[48], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m 
\u001b[43mserver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_data\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[37], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mserver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43minputs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m99996\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py:250\u001b[0m, in \u001b[0;36mGraphServer.test\u001b[0;34m(self, path, body, method, headers, content_type, silent, get_body, event_id, trigger, offset, time)\u001b[0m\n\u001b[1;32m 248\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun(event, get_body\u001b[38;5;241m=\u001b[39mget_body)\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(resp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus_code\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m resp\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m300\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m silent:\n\u001b[0;32m--> 250\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfailed 
(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m): \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 251\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", - "\u001b[0;31mRuntimeError\u001b[0m: failed (400): MLRunInvalidArgumentError: input list must be in the same size of the index_keys list" + "\u001b[0;31mRuntimeError\u001b[0m: failed (400): XGBoostError: [17:42:48] /workspace/src/c_api/../data/array_interface.h:492: Unicode-3 is not supported.\nStack trace:\n [bt] (0) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x15c2ca) [0x7f406c4212ca]\n [bt] (1) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x19ff29) [0x7f406c464f29]\n [bt] (2) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x16c) [0x7f406c435a6c]\n [bt] (3) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x7f411ac7f052]\n [bt] (4) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x7f411ac7d925]\n [bt] (5) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f411ac7e06e]\n [bt] (6) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x91e0) [0x7f411ac8f1e0]\n [bt] (7) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x8568) [0x7f411ac8e568]\n [bt] (8) /home/sagemaker-user/.conda/envs/smdemo/bin/python3.9(_PyObject_MakeTpCall+0x2ec) [0x4f073c]\n\n" ] } ], "source": [ - "response = server.test(body=inputs_data)" + "response = server.test(body={'inputs':[[99996]]})" ] }, { diff --git 
a/src/functions/serving.py b/src/functions/serving.py index 6b6a3f7..214239c 100644 --- a/src/functions/serving.py +++ b/src/functions/serving.py @@ -30,6 +30,8 @@ def load(self): def predict(self, body: dict) -> List: """Generate model predictions from sample.""" + print(body) + # Convert input to numpy array: data = np.asarray(body["inputs"]) @@ -75,58 +77,58 @@ def postprocess(inputs: dict) -> dict: inputs["confidences"] = confidences return inputs -def get_realtime_transactions_aggregations(): - # Create a feature vector that gets the average amount - vector = fstore.FeatureVector("aggregations-vector", ["aggregations.amount_avg_1d"], with_indexes=True) - #get the categories list - unique_categories = ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16"] - # Use online feature service to get the latest average amount per category - with vector.get_online_feature_service() as online_feature_service: - resp = online_feature_service.get( - [{"transaction_category":cat} for cat in unique_categories] - ) - return resp - -def calculate_distances(resp, event): - for cat in resp: - transaction_category = cat['transaction_category'] - amount_avg = cat['amount_avg_1d'] - event[0]["dist_" + transaction_category] = abs(amount_avg - event[0]["amount"]) - - return event - -def convert_timestamp_to_components(event): - event[0]["year"] = event[0]["timestamp"].year - event[0]["month"] = event[0]["timestamp"].month - event[0]["day"] = event[0]["timestamp"].day - event[0]["hour"] = event[0]["timestamp"].hour - event[0]["minute"] = event[0]["timestamp"].minute - event[0]["second"] = event[0]["timestamp"].second - del event[0]['timestamp'] - - return event - -def move_to_end(ls, key): - """Move an item to the end of the dictionary.""" - d = ls[0] - if key in d: - value = d.pop(key) # Remove the item and get its value - d[key] = value # Reinsert the item, which moves it to the end - ls[0] = d - return ls - - - - -# Function that preprocesses the inference 
data -def preprocess(event): - resp = get_realtime_transactions_aggregations() - dist_event = calculate_distances(resp, event) - converted_event = convert_timestamp_to_components(event) - restructured_event = move_to_end(converted_event,'transaction_id') - values_list = list(restructured_event[0].values()) - return_list = [values_list] - return_list - return_dict = {"inputs": return_list} - return return_dict +# def get_realtime_transactions_aggregations(): +# # Create a feature vector that gets the average amount +# vector = fstore.FeatureVector("aggregations-vector", ["aggregations.amount_avg_1d"], with_indexes=True) +# #get the categories list +# unique_categories = ["0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16"] +# # Use online feature service to get the latest average amount per category +# with vector.get_online_feature_service() as online_feature_service: +# resp = online_feature_service.get( +# [{"transaction_category":cat} for cat in unique_categories] +# ) +# return resp + +# def calculate_distances(resp, event): +# for cat in resp: +# transaction_category = cat['transaction_category'] +# amount_avg = cat['amount_avg_1d'] +# event[0]["dist_" + transaction_category] = abs(amount_avg - event[0]["amount"]) + +# return event + +# def convert_timestamp_to_components(event): +# event[0]["year"] = event[0]["timestamp"].year +# event[0]["month"] = event[0]["timestamp"].month +# event[0]["day"] = event[0]["timestamp"].day +# event[0]["hour"] = event[0]["timestamp"].hour +# event[0]["minute"] = event[0]["timestamp"].minute +# event[0]["second"] = event[0]["timestamp"].second +# del event[0]['timestamp'] + +# return event + +# def move_to_end(ls, key): +# """Move an item to the end of the dictionary.""" +# d = ls[0] +# if key in d: +# value = d.pop(key) # Remove the item and get its value +# d[key] = value # Reinsert the item, which moves it to the end +# ls[0] = d +# return ls + + + + +# # Function that preprocesses the inference data 
+# def preprocess(event): +# resp = get_realtime_transactions_aggregations() +# dist_event = calculate_distances(resp, event) +# converted_event = convert_timestamp_to_components(event) +# restructured_event = move_to_end(converted_event,'transaction_id') +# values_list = list(restructured_event[0].values()) +# return_list = [values_list] +# return_list +# return_dict = {"inputs": return_list} +# return return_dict \ No newline at end of file diff --git a/utils.py b/utils.py index b5eb3a5..4ec4cf3 100644 --- a/utils.py +++ b/utils.py @@ -1,46 +1,63 @@ import pandas as pd -from datetime import datetime, timedelta +import datetime -# Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day) -def update_timestamps(data): - # Get today's date - today = datetime.today() +#from datetime import datetime, timedelta - # Calculate the dates for the last 5 days - last_5_days = [today - timedelta(days=i) for i in range(4, -1, -1)] # Reverse for chronological order +def update_timestamps(data): + + # Step 3: Get the current time + now = pd.Timestamp(datetime.datetime.now()) + + # Step 4: Calculate the time difference + time_difference = now - data['timestamp'].iloc[-1] + + # Step 5: Adjust all timestamps + data['timestamp'] = data['timestamp'] + time_difference + + # Display the adjusted DataFrame + return data - # Extract year, month, and day from each date object - years = [d.year for d in last_5_days] - months = [d.month for d in last_5_days] - days = [d.day for d in last_5_days] - hours = [10, 15] +# # Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day) +# def update_timestamps(data): +# # Get today's date +# today = datetime.today() - # Create a list of timestamps of the last 5 days, 2 timestamps per day. 
- times = [] - for year, month, day in zip(years, months, days): - for hour in hours: - times.append(datetime(year, month, day, hour)) +# # Calculate the dates for the last 5 days +# last_5_days = [today - timedelta(days=i) for i in range(4, -1, -1)] # Reverse for chronological order - # Iterate over each transaction category - for i in range(len(data["transaction_category_key"].unique())): - # Extract all the rows for each category - category_data = data[data['transaction_category_key'] == str(i)] +# # Extract year, month, and day from each date object +# years = [d.year for d in last_5_days] +# months = [d.month for d in last_5_days] +# days = [d.day for d in last_5_days] - # Ensure timestamp is a datetime object - pd.to_datetime(category_data.timestamp) +# hours = [10, 15] - # Sort DataFrame by timestamp in descending order - category_data_sorted = category_data.sort_values(by='timestamp', ascending=False) +# # Create a list of timestamps of the last 5 days, 2 timestamps per day. +# times = [] +# for year, month, day in zip(years, months, days): +# for hour in hours: +# times.append(datetime(year, month, day, hour)) - # Select the latest rows and update their timestamp - latest_rows = category_data_sorted.head(len(times)) - latest_rows.loc[:, 'timestamp'] = times +# # Iterate over each transaction category +# for i in range(len(data["transaction_category"].unique())): +# # Extract all the rows for each category +# category_data = data[data['transaction_category'] == str(i)] - # Update the initial dataframe to include those updated rows - data.update(latest_rows) +# # Ensure timestamp is a datetime object +# pd.to_datetime(category_data.timestamp) - data.sort_values(["transaction_category_key", "timestamp"], inplace=True) +# # Sort DataFrame by timestamp in descending order +# category_data_sorted = category_data.sort_values(by='timestamp', ascending=False) +# # Select the latest rows and update their timestamp +# latest_rows = 
category_data_sorted.head(len(times)) +# latest_rows.loc[:, 'timestamp'] = times - return data \ No newline at end of file +# # Update the initial dataframe to include those updated rows +# data.update(latest_rows) + +# data.sort_values(["transaction_category", "timestamp"], inplace=True) + + +# return data \ No newline at end of file From 6fb7088ea12cc96bf6cbde82bab6ea7131b693c0 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Sun, 11 Feb 2024 13:01:57 +0000 Subject: [PATCH 08/16] upadting payment serving --- financial_payment_classification_v3.ipynb | 2547 +++++++++++++-------- 1 file changed, 1594 insertions(+), 953 deletions(-) diff --git a/financial_payment_classification_v3.ipynb b/financial_payment_classification_v3.ipynb index 60e8dea..6cc8ec8 100644 --- a/financial_payment_classification_v3.ipynb +++ b/financial_payment_classification_v3.ipynb @@ -108,13 +108,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-08 17:19:18,429 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" + "> 2024-02-11 11:55:08,243 [info] Project loaded successfully: {'project_name': 'sagemaker-v8'}\n" ] } ], "source": [ "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v3\", \n", + " name=\"sagemaker-v8\", \n", " user_project=True,\n", " parameters={\n", " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", @@ -352,7 +352,7 @@ " 4518551904499919\n", " 4333582346477646\n", " 833.26\n", - " 2024-01-05 12:35:02.641158\n", + " 2024-01-08 07:10:52.429358\n", " \n", " \n", " 1\n", @@ -360,7 +360,7 @@ " 4518551904499919\n", " 4642413144038776\n", " 596.63\n", - " 2023-12-09 10:30:52.641158\n", + " 2023-12-12 05:06:42.429358\n", " \n", " \n", " 2\n", @@ -368,7 +368,7 @@ " 4274544022939522\n", " 4952665515556751\n", " 176.76\n", - " 2023-12-19 11:06:52.641158\n", + " 2023-12-22 05:42:42.429358\n", " \n", " \n", " 3\n", @@ -376,7 +376,7 @@ " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", - " 2024-02-04 
08:51:39.641158\n", + " 2024-02-07 03:27:29.429358\n", " \n", " \n", " 4\n", @@ -384,7 +384,7 @@ " 4601853246125220\n", " 4578126462896710\n", " 742.25\n", - " 2024-01-30 08:27:36.641158\n", + " 2024-02-02 03:03:26.429358\n", " \n", " \n", " ...\n", @@ -400,7 +400,7 @@ " 4405008355220324\n", " 4583355906735225\n", " 205.43\n", - " 2024-02-15 05:01:13.641158\n", + " 2024-02-17 23:37:03.429358\n", " \n", " \n", " 99993\n", @@ -408,7 +408,7 @@ " 4300416744511335\n", " 4949240916846171\n", " 151.49\n", - " 2024-01-19 12:07:38.641158\n", + " 2024-01-22 06:43:28.429358\n", " \n", " \n", " 99994\n", @@ -416,7 +416,7 @@ " 4405008355220324\n", " 4996896020767264\n", " 188.28\n", - " 2024-01-03 12:28:30.641158\n", + " 2024-01-06 07:04:20.429358\n", " \n", " \n", " 99995\n", @@ -424,7 +424,7 @@ " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2023-12-12 16:02:27.641158\n", + " 2023-12-15 10:38:17.429358\n", " \n", " \n", " 99996\n", @@ -432,7 +432,7 @@ " 4627516674144704\n", " 4250420705087194\n", " 207.92\n", - " 2024-02-08 17:19:20.641158\n", + " 2024-02-11 11:55:10.429358\n", " \n", " \n", "\n", @@ -454,17 +454,17 @@ "99996 Pension and insurances 4627516674144704 4250420705087194 207.92 \n", "\n", " timestamp \n", - "0 2024-01-05 12:35:02.641158 \n", - "1 2023-12-09 10:30:52.641158 \n", - "2 2023-12-19 11:06:52.641158 \n", - "3 2024-02-04 08:51:39.641158 \n", - "4 2024-01-30 08:27:36.641158 \n", + "0 2024-01-08 07:10:52.429358 \n", + "1 2023-12-12 05:06:42.429358 \n", + "2 2023-12-22 05:42:42.429358 \n", + "3 2024-02-07 03:27:29.429358 \n", + "4 2024-02-02 03:03:26.429358 \n", "... ... 
\n", - "99992 2024-02-15 05:01:13.641158 \n", - "99993 2024-01-19 12:07:38.641158 \n", - "99994 2024-01-03 12:28:30.641158 \n", - "99995 2023-12-12 16:02:27.641158 \n", - "99996 2024-02-08 17:19:20.641158 \n", + "99992 2024-02-17 23:37:03.429358 \n", + "99993 2024-01-22 06:43:28.429358 \n", + "99994 2024-01-06 07:04:20.429358 \n", + "99995 2023-12-15 10:38:17.429358 \n", + "99996 2024-02-11 11:55:10.429358 \n", "\n", "[99997 rows x 5 columns]" ] @@ -491,27 +491,6 @@ { "cell_type": "code", "execution_count": 12, - "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", - "metadata": {}, - "outputs": [], - "source": [ - "# for key, val in factorize_key.items():\n", - "# factorize_key[key] = str(val)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "ea2ebdd5", - "metadata": {}, - "outputs": [], - "source": [ - "# data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", "metadata": {}, "outputs": [], @@ -519,34 +498,6 @@ "data['transaction_id']= data.reset_index().index " ] }, - { - "cell_type": "code", - "execution_count": 16, - "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# # Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day)\n", - "# from utils import update_timestamps\n", - "# data = update_timestamps(data)\n", - "# data" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "3f47156f-64f1-40bd-801a-58136e2a25cb", - "metadata": {}, - "outputs": [], - "source": [ - "# main_categories = list(factorize_key.keys())\n", - "# part_categories = main_categories[:3] \n", - "# part_data = data[data['transaction_category'].isin(part_categories)][:9999]\n", - "# part_data" - ] - }, { "cell_type": "markdown", "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", @@ -569,21 
+520,44 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", "metadata": {}, "outputs": [], "source": [ - "def calculate_category_distance(event):\n", - " category = event['transaction_category']\n", + "# move category to the first column to match sagemaker label train convention\"\n", + "def pop_and_move_to_start(d, key):\n", + " # Pop the item if it exists, otherwise return None\n", + " value = d.pop(key, None)\n", + " if value is not None:\n", + " # Move the popped item to the start\n", + " d = {key: value, **d}\n", + " return d\n", + "\n", + "def calculate_category_distance(event): \n", + " column_name ='transaction_category_mapped'\n", + " #print(type(event))\n", + " #print(event)\n", + " event = pop_and_move_to_start(event,column_name)\n", + " #print(event)\n", + " category = event[column_name]\n", " #event[category+'distance'] = abs(event['amount']-event[category+'_avg_1d'])\n", " event['distance'] = abs(event['amount']/2)\n", + " \n", " return event" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, + "id": "7d236f74-c850-4ff0-a8ee-e4a89966ddb1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 14, "id": "4101c303-2da3-431b-9375-9fa1747070af", "metadata": {}, "outputs": [ @@ -596,11 +570,11 @@ "\n", "\n", - "\n", + "\n", "\n", "mlrun-flow\n", - "\n", + "\n", "\n", "\n", "_start\n", @@ -619,88 +593,88 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", - "OneHotEncoder\n", - "\n", - "OneHotEncoder\n", + "MapValues\n", + "\n", + "MapValues\n", "\n", - "\n", + "\n", "\n", - "DateExtractor->OneHotEncoder\n", - "\n", - "\n", + "DateExtractor->MapValues\n", + "\n", + "\n", "\n", "\n", "\n", "Aggregates\n", - "\n", - "Aggregates\n", + "\n", + "Aggregates\n", "\n", - "\n", + "\n", "\n", - "OneHotEncoder->Aggregates\n", - "\n", - "\n", + "MapValues->Aggregates\n", + "\n", + "\n", "\n", "\n", "\n", 
"calculate_category_distance\n", - "\n", - "calculate_category_distance\n", + "\n", + "calculate_category_distance\n", "\n", "\n", "\n", "Aggregates->calculate_category_distance\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "DropFeatures\n", - "\n", - "DropFeatures\n", + "\n", + "DropFeatures\n", "\n", "\n", "\n", "calculate_category_distance->DropFeatures\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "parquet/parquet\n", - "\n", - "\n", - "parquet\n", + "\n", + "\n", + "parquet\n", "\n", "\n", "\n", "DropFeatures->parquet/parquet\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "nosql/nosql\n", - "\n", - "\n", - "nosql\n", + "\n", + "\n", + "nosql\n", "\n", "\n", "\n", "DropFeatures->nosql/nosql\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 19, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -715,7 +689,7 @@ "#main_categories = part_categories\n", "\n", "# One Hot Encode the newly defined mappings\n", - "one_hot_encoder_mapping = {'category': main_categories}\n", + "#one_hot_encoder_mapping = {'transaction_category': main_categories}\n", "\n", "# creating feature set\n", "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", @@ -724,17 +698,13 @@ "# setting up the graph\n", "extended_transactions_set.graph \\\n", " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp')) \\\n", - " .to(OneHotEncoder(mapping=one_hot_encoder_mapping))\n", - "extended_transactions_set.add_aggregation(name='amount',column='amount',operations=['avg'],windows=['1d'],period='1h')\n", + " .to(MapValues({'transaction_category' : factorize_key}, with_original_features=True)) \\\n", "\n", - "# # Add the category aggregations over a 14 day window\n", - "# for category in main_categories:\n", - "# extended_transactions_set.add_aggregation(name=category,column=f'category_{category}',\n", - "# operations=['avg'], 
windows=['1d'])\n", + "extended_transactions_set.add_aggregation(name='amount',column='amount',operations=['avg'],windows=['1d'],period='1h')\n", "\n", "extended_transactions_set.graph \\\n", " .to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates') \\\n", - " .to(DropFeatures(features=['timestamp']))\n", + " .to(DropFeatures(features=['timestamp','transaction_category']))\n", "\n", "\n", "extended_transactions_set.set_targets()\n", @@ -744,17 +714,10 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", + "execution_count": 15, + "id": "53eb2151-447a-4eb0-be7f-a07f1cbea32d", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-08 17:20:30,917 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" - ] - }, { "data": { "text/html": [ @@ -776,110 +739,59 @@ " \n", "
\n", " \n", - " amount_avg_1d\n", " transaction_category\n", " receiver_id\n", " sender_id\n", " amount\n", - " timestamp_year\n", - " timestamp_month\n", - " timestamp_day\n", - " timestamp_hour\n", - " timestamp_minute\n", - " timestamp_second\n", - " distance\n", - "
\n", - "
\n", + " timestamp\n", " transaction_id\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", "
\n", "
\n", " \n", "
\n", - " 0\n", - " 833.26\n", + " 3\n", " Uncategorized\n", " 4518551904499919\n", - " 4333582346477646\n", - " 833.26\n", - " 2024\n", - " 1\n", - " 5\n", - " 12\n", - " 35\n", - " 2\n", - " 416.630\n", + " 4457298962882528\n", + " 879.78\n", + " 2024-02-07 03:27:29.429358\n", + " 3\n", "
\n", "
\n", - " 1\n", - " 596.63\n", + " 7\n", " Uncategorized\n", - " 4518551904499919\n", - " 4642413144038776\n", - " 596.63\n", - " 2023\n", - " 12\n", - " 9\n", - " 10\n", - " 30\n", - " 52\n", - " 298.315\n", + " 4757951915669080\n", + " 4655296518888015\n", + " 801.22\n", + " 2023-12-10 05:59:51.429358\n", + " 7\n", "
\n", "
\n", - " 2\n", - " 176.76\n", + " 11\n", " Uncategorized\n", - " 4274544022939522\n", - " 4952665515556751\n", - " 176.76\n", - " 2023\n", - " 12\n", - " 19\n", + " 4518551904499919\n", + " 4910949333064003\n", + " 423.31\n", + " 2024-01-16 23:31:23.429358\n", " 11\n", - " 6\n", - " 52\n", - " 88.380\n", "
\n", "
\n", - " 3\n", - " 879.78\n", + " 15\n", " Uncategorized\n", " 4518551904499919\n", - " 4457298962882528\n", - " 879.78\n", - " 2024\n", - " 2\n", - " 4\n", - " 8\n", - " 51\n", - " 39\n", - " 439.890\n", + " 4415760195692405\n", + " 382.73\n", + " 2023-11-20 06:23:53.429358\n", + " 15\n", "
\n", "
\n", - " 4\n", - " 742.25\n", + " 19\n", " Uncategorized\n", - " 4601853246125220\n", - " 4578126462896710\n", - " 742.25\n", - " 2024\n", - " 1\n", - " 30\n", - " 8\n", - " 27\n", - " 36\n", - " 371.125\n", + " 4098088980692974\n", + " 4412940106031926\n", + " 111.77\n", + " 2023-11-06 03:05:59.429358\n", + " 19\n", "
\n", "
\n", " ...\n", @@ -889,167 +801,113 @@ " ...\n", " ...\n", " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", "
\n", "
\n", - " 99992\n", - " 205.43\n", + " 99979\n", " Pension and insurances\n", - " 4405008355220324\n", - " 4583355906735225\n", - " 205.43\n", - " 2024\n", - " 2\n", - " 15\n", - " 5\n", - " 1\n", - " 13\n", - " 102.715\n", + " 4179606860088849\n", + " 4359198069543354\n", + " 302.10\n", + " 2024-01-09 19:01:59.429358\n", + " 99979\n", "
\n", "
\n", - " 99993\n", - " 151.49\n", + " 99983\n", " Pension and insurances\n", - " 4300416744511335\n", - " 4949240916846171\n", - " 151.49\n", - " 2024\n", - " 1\n", - " 19\n", - " 12\n", - " 7\n", - " 38\n", - " 75.745\n", + " 4751538620733305\n", + " 4021524999937895\n", + " 115.89\n", + " 2024-02-05 05:15:44.429358\n", + " 99983\n", "
\n", "
\n", - " 99994\n", - " 188.28\n", + " 99987\n", " Pension and insurances\n", " 4405008355220324\n", - " 4996896020767264\n", - " 188.28\n", - " 2024\n", - " 1\n", - " 3\n", - " 12\n", - " 28\n", - " 30\n", - " 94.140\n", + " 4165276502284291\n", + " 207.08\n", + " 2023-12-14 12:49:32.429358\n", + " 99987\n", + "
\n", + "
\n", + " 99991\n", + " Pension and insurances\n", + " 4092115788877543\n", + " 4328901131757235\n", + " 355.58\n", + " 2024-02-20 06:08:09.429358\n", + " 99991\n", "
\n", "
\n", " 99995\n", - " 204.26\n", " Pension and insurances\n", " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2023\n", - " 12\n", - " 12\n", - " 16\n", - " 2\n", - " 27\n", - " 102.130\n", - "
\n", - "
\n", - " 99996\n", - " 207.92\n", - " Pension and insurances\n", - " 4627516674144704\n", - " 4250420705087194\n", - " 207.92\n", - " 2024\n", - " 2\n", - " 8\n", - " 17\n", - " 19\n", - " 20\n", - " 103.960\n", + " 2023-12-15 10:38:17.429358\n", + " 99995\n", "
\n", "
\n", "\n", - "

99997 rows × 12 columns

\n", + "

24999 rows × 6 columns

\n", "" ], "text/plain": [ - " amount_avg_1d transaction_category receiver_id \\\n", - "transaction_id \n", - "0 833.26 Uncategorized 4518551904499919 \n", - "1 596.63 Uncategorized 4518551904499919 \n", - "2 176.76 Uncategorized 4274544022939522 \n", - "3 879.78 Uncategorized 4518551904499919 \n", - "4 742.25 Uncategorized 4601853246125220 \n", - "... ... ... ... \n", - "99992 205.43 Pension and insurances 4405008355220324 \n", - "99993 151.49 Pension and insurances 4300416744511335 \n", - "99994 188.28 Pension and insurances 4405008355220324 \n", - "99995 204.26 Pension and insurances 4262047194499006 \n", - "99996 207.92 Pension and insurances 4627516674144704 \n", - "\n", - " sender_id amount timestamp_year timestamp_month \\\n", - "transaction_id \n", - "0 4333582346477646 833.26 2024 1 \n", - "1 4642413144038776 596.63 2023 12 \n", - "2 4952665515556751 176.76 2023 12 \n", - "3 4457298962882528 879.78 2024 2 \n", - "4 4578126462896710 742.25 2024 1 \n", - "... ... ... ... ... \n", - "99992 4583355906735225 205.43 2024 2 \n", - "99993 4949240916846171 151.49 2024 1 \n", - "99994 4996896020767264 188.28 2024 1 \n", - "99995 4017367486513464 204.26 2023 12 \n", - "99996 4250420705087194 207.92 2024 2 \n", - "\n", - " timestamp_day timestamp_hour timestamp_minute \\\n", - "transaction_id \n", - "0 5 12 35 \n", - "1 9 10 30 \n", - "2 19 11 6 \n", - "3 4 8 51 \n", - "4 30 8 27 \n", - "... ... ... ... \n", - "99992 15 5 1 \n", - "99993 19 12 7 \n", - "99994 3 12 28 \n", - "99995 12 16 2 \n", - "99996 8 17 19 \n", + " transaction_category receiver_id sender_id amount \\\n", + "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", + "7 Uncategorized 4757951915669080 4655296518888015 801.22 \n", + "11 Uncategorized 4518551904499919 4910949333064003 423.31 \n", + "15 Uncategorized 4518551904499919 4415760195692405 382.73 \n", + "19 Uncategorized 4098088980692974 4412940106031926 111.77 \n", + "... ... ... ... ... 
\n", + "99979 Pension and insurances 4179606860088849 4359198069543354 302.10 \n", + "99983 Pension and insurances 4751538620733305 4021524999937895 115.89 \n", + "99987 Pension and insurances 4405008355220324 4165276502284291 207.08 \n", + "99991 Pension and insurances 4092115788877543 4328901131757235 355.58 \n", + "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", "\n", - " timestamp_second distance \n", - "transaction_id \n", - "0 2 416.630 \n", - "1 52 298.315 \n", - "2 52 88.380 \n", - "3 39 439.890 \n", - "4 36 371.125 \n", - "... ... ... \n", - "99992 13 102.715 \n", - "99993 38 75.745 \n", - "99994 30 94.140 \n", - "99995 27 102.130 \n", - "99996 20 103.960 \n", + " timestamp transaction_id \n", + "3 2024-02-07 03:27:29.429358 3 \n", + "7 2023-12-10 05:59:51.429358 7 \n", + "11 2024-01-16 23:31:23.429358 11 \n", + "15 2023-11-20 06:23:53.429358 15 \n", + "19 2023-11-06 03:05:59.429358 19 \n", + "... ... ... \n", + "99979 2024-01-09 19:01:59.429358 99979 \n", + "99983 2024-02-05 05:15:44.429358 99983 \n", + "99987 2023-12-14 12:49:32.429358 99987 \n", + "99991 2024-02-20 06:08:09.429358 99991 \n", + "99995 2023-12-15 10:38:17.429358 99995 \n", "\n", - "[99997 rows x 12 columns]" + "[24999 rows x 6 columns]" ] }, - "execution_count": 20, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ingested_data = extended_transactions_set.ingest(data, overwrite=True)\n", - "ingested_data" + "# Keeping every second row\n", + "df_kept = data.iloc[::2]\n", + "\n", + "# Or, to explicitly remove every second row (the opposite selection)\n", + "df_removed = data.drop(data.index[::2])\n", + "\n", + "\n", + "# Keeping every second row\n", + "df_kept = df_removed.iloc[::2]\n", + "\n", + "# Or, to explicitly remove every second row (the opposite selection)\n", + "df_removed_v2 = df_removed.drop(df_removed.index[::2])\n", + "\n", + "df_removed_v2" ] }, { "cell_type": "code", - "execution_count": 21, - "id": 
"6595564d-91a2-49c0-93e1-dc8ebb28467d", + "execution_count": 26, + "id": "06c03ea5-8394-44ff-b81d-755e1c244269", "metadata": {}, "outputs": [ { @@ -1074,109 +932,292 @@ " \n", " \n", " transaction_category\n", - " amount_avg_1d\n", " receiver_id\n", " sender_id\n", " amount\n", - " timestamp_year\n", - " timestamp_month\n", - " timestamp_day\n", - " timestamp_hour\n", - " timestamp_minute\n", - " timestamp_second\n", - " distance\n", - " \n", - " \n", + " timestamp\n", " transaction_id\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " 0\n", + " 3\n", " Uncategorized\n", - " 833.26\n", " 4518551904499919\n", - " 4333582346477646\n", - " 833.26\n", - " 2024\n", - " 1\n", - " 5\n", - " 12\n", - " 35\n", - " 2\n", - " 416.630\n", + " 4457298962882528\n", + " 879.78\n", + " 2024-04-05 04:48:41.521043\n", + " 3\n", " \n", " \n", - " 1\n", + " 7\n", " Uncategorized\n", - " 596.63\n", - " 4518551904499919\n", - " 4642413144038776\n", - " 596.63\n", - " 2023\n", - " 12\n", - " 9\n", - " 10\n", - " 30\n", - " 52\n", - " 298.315\n", + " 4757951915669080\n", + " 4655296518888015\n", + " 801.22\n", + " 2024-02-06 07:21:03.521043\n", + " 7\n", " \n", " \n", - " 2\n", + " 11\n", " Uncategorized\n", - " 176.76\n", - " 4274544022939522\n", - " 4952665515556751\n", - " 176.76\n", - " 2023\n", - " 12\n", - " 19\n", + " 4518551904499919\n", + " 4910949333064003\n", + " 423.31\n", + " 2024-03-15 00:52:35.521043\n", " 11\n", - " 6\n", - " 52\n", - " 88.380\n", " \n", " \n", - " 3\n", + " 15\n", " Uncategorized\n", - " 879.78\n", " 4518551904499919\n", - " 4457298962882528\n", - " 879.78\n", - " 2024\n", - " 2\n", - " 4\n", - " 8\n", - " 51\n", - " 39\n", - " 439.890\n", + " 4415760195692405\n", + " 382.73\n", + " 2024-01-17 07:45:05.521043\n", + " 15\n", " \n", " \n", - " 4\n", + " 19\n", " Uncategorized\n", - " 742.25\n", - " 4601853246125220\n", - " 4578126462896710\n", - " 742.25\n", + " 
4098088980692974\n", + " 4412940106031926\n", + " 111.77\n", + " 2024-01-03 04:27:11.521043\n", + " 19\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 99979\n", + " Pension and insurances\n", + " 4179606860088849\n", + " 4359198069543354\n", + " 302.10\n", + " 2024-03-07 20:23:11.521043\n", + " 99979\n", + " \n", + " \n", + " 99983\n", + " Pension and insurances\n", + " 4751538620733305\n", + " 4021524999937895\n", + " 115.89\n", + " 2024-04-03 06:36:56.521043\n", + " 99983\n", + " \n", + " \n", + " 99987\n", + " Pension and insurances\n", + " 4405008355220324\n", + " 4165276502284291\n", + " 207.08\n", + " 2024-02-10 14:10:44.521043\n", + " 99987\n", + " \n", + " \n", + " 99991\n", + " Pension and insurances\n", + " 4092115788877543\n", + " 4328901131757235\n", + " 355.58\n", + " 2024-04-18 07:29:21.521043\n", + " 99991\n", + " \n", + " \n", + " 99995\n", + " Pension and insurances\n", + " 4262047194499006\n", + " 4017367486513464\n", + " 204.26\n", + " 2024-02-11 11:59:29.521043\n", + " 99995\n", + " \n", + " \n", + "\n", + "

24999 rows × 6 columns

\n", + "" + ], + "text/plain": [ + " transaction_category receiver_id sender_id amount \\\n", + "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", + "7 Uncategorized 4757951915669080 4655296518888015 801.22 \n", + "11 Uncategorized 4518551904499919 4910949333064003 423.31 \n", + "15 Uncategorized 4518551904499919 4415760195692405 382.73 \n", + "19 Uncategorized 4098088980692974 4412940106031926 111.77 \n", + "... ... ... ... ... \n", + "99979 Pension and insurances 4179606860088849 4359198069543354 302.10 \n", + "99983 Pension and insurances 4751538620733305 4021524999937895 115.89 \n", + "99987 Pension and insurances 4405008355220324 4165276502284291 207.08 \n", + "99991 Pension and insurances 4092115788877543 4328901131757235 355.58 \n", + "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", + "\n", + " timestamp transaction_id \n", + "3 2024-04-05 04:48:41.521043 3 \n", + "7 2024-02-06 07:21:03.521043 7 \n", + "11 2024-03-15 00:52:35.521043 11 \n", + "15 2024-01-17 07:45:05.521043 15 \n", + "19 2024-01-03 04:27:11.521043 19 \n", + "... ... ... \n", + "99979 2024-03-07 20:23:11.521043 99979 \n", + "99983 2024-04-03 06:36:56.521043 99983 \n", + "99987 2024-02-10 14:10:44.521043 99987 \n", + "99991 2024-04-18 07:29:21.521043 99991 \n", + "99995 2024-02-11 11:59:29.521043 99995 \n", + "\n", + "[24999 rows x 6 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import update_timestamps\n", + "df_removed_v2=update_timestamps(df_removed_v2)\n", + "df_removed_v2" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1194,221 +1235,904 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_id
30879.7845185519044999194457298962882528879.7820244544841439.890
70801.2247579519156690804655296518888015801.222024267213400.610
110423.3145185519044999194910949333064003423.31202431505235211.655
150382.7345185519044999194415760195692405382.7320241177455191.365
190111.7740980889806929744412940106031926111.7720241308342736371.1251155.885
......
99992Pension and insurances205.4344050083552203244583355906735225205.439997918302.1041796068600888494359198069543354302.1020242155113102.71537202311151.050
99993Pension and insurances151.4943004167445113354949240916846171151.499998318115.8947515386207333054021524999937895115.8920241191273875.745436365657.945
99994Pension and insurances188.289998718207.0844050083552203244996896020767264188.284165276502284291207.0820241312283094.140210141044103.540
9999118355.5840921157888775434328901131757235355.58202441872921177.790
99995Pension and insurances18204.2642620471944990064017367486513464204.262023121216227102.130
99996Pension and insurances207.9246275166741447044250420705087194207.92202428171920103.96011115929102.130
\n", - "

99997 rows × 12 columns

\n", + "

24999 rows × 12 columns

\n", "
" ], "text/plain": [ - " transaction_category amount_avg_1d receiver_id \\\n", - "transaction_id \n", - "0 Uncategorized 833.26 4518551904499919 \n", - "1 Uncategorized 596.63 4518551904499919 \n", - "2 Uncategorized 176.76 4274544022939522 \n", - "3 Uncategorized 879.78 4518551904499919 \n", - "4 Uncategorized 742.25 4601853246125220 \n", - "... ... ... ... \n", - "99992 Pension and insurances 205.43 4405008355220324 \n", - "99993 Pension and insurances 151.49 4300416744511335 \n", - "99994 Pension and insurances 188.28 4405008355220324 \n", - "99995 Pension and insurances 204.26 4262047194499006 \n", - "99996 Pension and insurances 207.92 4627516674144704 \n", + " transaction_category_mapped amount_avg_1d receiver_id \\\n", + "transaction_id \n", + "3 0 879.78 4518551904499919 \n", + "7 0 801.22 4757951915669080 \n", + "11 0 423.31 4518551904499919 \n", + "15 0 382.73 4518551904499919 \n", + "19 0 111.77 4098088980692974 \n", + "... ... ... ... \n", + "99979 18 302.10 4179606860088849 \n", + "99983 18 115.89 4751538620733305 \n", + "99987 18 207.08 4405008355220324 \n", + "99991 18 355.58 4092115788877543 \n", + "99995 18 204.26 4262047194499006 \n", "\n", " sender_id amount timestamp_year timestamp_month \\\n", "transaction_id \n", - "0 4333582346477646 833.26 2024 1 \n", - "1 4642413144038776 596.63 2023 12 \n", - "2 4952665515556751 176.76 2023 12 \n", - "3 4457298962882528 879.78 2024 2 \n", - "4 4578126462896710 742.25 2024 1 \n", + "3 4457298962882528 879.78 2024 4 \n", + "7 4655296518888015 801.22 2024 2 \n", + "11 4910949333064003 423.31 2024 3 \n", + "15 4415760195692405 382.73 2024 1 \n", + "19 4412940106031926 111.77 2024 1 \n", "... ... ... ... ... 
\n", - "99992 4583355906735225 205.43 2024 2 \n", - "99993 4949240916846171 151.49 2024 1 \n", - "99994 4996896020767264 188.28 2024 1 \n", - "99995 4017367486513464 204.26 2023 12 \n", - "99996 4250420705087194 207.92 2024 2 \n", + "99979 4359198069543354 302.10 2024 3 \n", + "99983 4021524999937895 115.89 2024 4 \n", + "99987 4165276502284291 207.08 2024 2 \n", + "99991 4328901131757235 355.58 2024 4 \n", + "99995 4017367486513464 204.26 2024 2 \n", "\n", " timestamp_day timestamp_hour timestamp_minute \\\n", "transaction_id \n", - "0 5 12 35 \n", - "1 9 10 30 \n", - "2 19 11 6 \n", - "3 4 8 51 \n", - "4 30 8 27 \n", + "3 5 4 48 \n", + "7 6 7 21 \n", + "11 15 0 52 \n", + "15 17 7 45 \n", + "19 3 4 27 \n", "... ... ... ... \n", - "99992 15 5 1 \n", - "99993 19 12 7 \n", - "99994 3 12 28 \n", - "99995 12 16 2 \n", - "99996 8 17 19 \n", + "99979 7 20 23 \n", + "99983 3 6 36 \n", + "99987 10 14 10 \n", + "99991 18 7 29 \n", + "99995 11 11 59 \n", "\n", " timestamp_second distance \n", "transaction_id \n", - "0 2 416.630 \n", - "1 52 298.315 \n", - "2 52 88.380 \n", - "3 39 439.890 \n", - "4 36 371.125 \n", + "3 41 439.890 \n", + "7 3 400.610 \n", + "11 35 211.655 \n", + "15 5 191.365 \n", + "19 11 55.885 \n", "... ... ... 
\n", - "99992 13 102.715 \n", - "99993 38 75.745 \n", - "99994 30 94.140 \n", - "99995 27 102.130 \n", - "99996 20 103.960 \n", + "99979 11 151.050 \n", + "99983 56 57.945 \n", + "99987 44 103.540 \n", + "99991 21 177.790 \n", + "99995 29 102.130 \n", "\n", - "[99997 rows x 12 columns]" + "[24999 rows x 12 columns]" ] }, - "execution_count": 21, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#data = ingested_data.reset_index(drop=True)\n", - "data = ingested_data\n", - "data = data[['transaction_category'] + [col for col in data.columns if col != 'transaction_category']]\n", - "data" + "ingested_data = extended_transactions_set.ingest(df_removed_v2, overwrite=True)\n", + "ingested_data" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "e1d377a5-cf7e-4564-8e14-10bfbaca4da2", + "execution_count": 28, + "id": "6595564d-91a2-49c0-93e1-dc8ebb28467d", "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "['transaction_category',\n", - " 'amount_avg_1d',\n", - " 'receiver_id',\n", - " 'sender_id',\n", - " 'amount',\n", - " 'timestamp_year',\n", - " 'timestamp_month',\n", - " 'timestamp_day',\n", - " 'timestamp_hour',\n", - " 'timestamp_minute',\n", - " 'timestamp_second',\n", - " 'distance']" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_id
30879.7845185519044999194457298962882528879.7820244544841439.890
70801.2247579519156690804655296518888015801.222024267213400.610
110423.3145185519044999194910949333064003423.31202431505235211.655
150382.7345185519044999194415760195692405382.7320241177455191.365
190111.7740980889806929744412940106031926111.772024134271155.885
.......................................
9997918302.1041796068600888494359198069543354302.10202437202311151.050
9998318115.8947515386207333054021524999937895115.892024436365657.945
9998718207.0844050083552203244165276502284291207.082024210141044103.540
9999118355.5840921157888775434328901131757235355.58202441872921177.790
9999518204.2642620471944990064017367486513464204.262024211115929102.130
\n", + "

24999 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category_mapped amount_avg_1d receiver_id \\\n", + "transaction_id \n", + "3 0 879.78 4518551904499919 \n", + "7 0 801.22 4757951915669080 \n", + "11 0 423.31 4518551904499919 \n", + "15 0 382.73 4518551904499919 \n", + "19 0 111.77 4098088980692974 \n", + "... ... ... ... \n", + "99979 18 302.10 4179606860088849 \n", + "99983 18 115.89 4751538620733305 \n", + "99987 18 207.08 4405008355220324 \n", + "99991 18 355.58 4092115788877543 \n", + "99995 18 204.26 4262047194499006 \n", + "\n", + " sender_id amount timestamp_year timestamp_month \\\n", + "transaction_id \n", + "3 4457298962882528 879.78 2024 4 \n", + "7 4655296518888015 801.22 2024 2 \n", + "11 4910949333064003 423.31 2024 3 \n", + "15 4415760195692405 382.73 2024 1 \n", + "19 4412940106031926 111.77 2024 1 \n", + "... ... ... ... ... \n", + "99979 4359198069543354 302.10 2024 3 \n", + "99983 4021524999937895 115.89 2024 4 \n", + "99987 4165276502284291 207.08 2024 2 \n", + "99991 4328901131757235 355.58 2024 4 \n", + "99995 4017367486513464 204.26 2024 2 \n", + "\n", + " timestamp_day timestamp_hour timestamp_minute \\\n", + "transaction_id \n", + "3 5 4 48 \n", + "7 6 7 21 \n", + "11 15 0 52 \n", + "15 17 7 45 \n", + "19 3 4 27 \n", + "... ... ... ... \n", + "99979 7 20 23 \n", + "99983 3 6 36 \n", + "99987 10 14 10 \n", + "99991 18 7 29 \n", + "99995 11 11 59 \n", + "\n", + " timestamp_second distance \n", + "transaction_id \n", + "3 41 439.890 \n", + "7 3 400.610 \n", + "11 35 211.655 \n", + "15 5 191.365 \n", + "19 11 55.885 \n", + "... ... ... 
\n", + "99979 11 151.050 \n", + "99983 56 57.945 \n", + "99987 44 103.540 \n", + "99991 21 177.790 \n", + "99995 29 102.130 \n", + "\n", + "[24999 rows x 12 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#data = ingested_data.reset_index(drop=True)\n", + "data = ingested_data\n", + "#data = data[['transaction_category'] + [col for col in data.columns if col != 'transaction_category']]\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e1d377a5-cf7e-4564-8e14-10bfbaca4da2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['transaction_category_mapped',\n", + " 'amount_avg_1d',\n", + " 'receiver_id',\n", + " 'sender_id',\n", + " 'amount',\n", + " 'timestamp_year',\n", + " 'timestamp_month',\n", + " 'timestamp_day',\n", + " 'timestamp_hour',\n", + " 'timestamp_minute',\n", + " 'timestamp_second',\n", + " 'distance']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_cols = list(data.columns)\n", + "data_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", + "metadata": {}, + "outputs": [], + "source": [ + "# Import MLRun's Feature Store\n", + "import mlrun.feature_store as fstore\n", + "\n", + "# create feature vector on top of aggreagations\n", + "# Define the list of features we will be using\n", + "features = [f\"transactions.{name}\" for name in data_cols[1:]] \n", + "\n", + "\n", + "# Define the feature vector name for future reference\n", + "fv_name = 'transactions-vector'\n", + "\n", + "# Define the feature vector using our Feature Store (fstore)\n", + "transactions_fv = fstore.FeatureVector(fv_name, \n", + " features,\n", + " label_feature='transactions.transaction_category_mapped',\n", + " description='stocks information')\n", + "\n", + "#label_feature = 'transactions-v2.transaction_category',\n", + "# 
Save the feature vector in the Feature Store\n", + "transactions_fv.save()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "eb69d9fa-22a9-4b9f-9443-d00d9190ad55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amount_avg_1dreceiver_idsender_idamounttimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistancetransaction_category_mapped
0879.7845185519044999194457298962882528879.7820244544841439.8900
1801.2247579519156690804655296518888015801.222024267213400.6100
2423.3145185519044999194910949333064003423.31202431505235211.6550
3382.7345185519044999194415760195692405382.7320241177455191.3650
4111.7740980889806929744412940106031926111.772024134271155.8850
.......................................
24994302.1041796068600888494359198069543354302.10202437202311151.05018
24995115.8947515386207333054021524999937895115.892024436365657.94518
24996207.0844050083552203244165276502284291207.082024210141044103.54018
24997355.5840921157888775434328901131757235355.58202441872921177.79018
24998204.2642620471944990064017367486513464204.262024211115929102.13018
\n", + "

24999 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " amount_avg_1d receiver_id sender_id amount \\\n", + "0 879.78 4518551904499919 4457298962882528 879.78 \n", + "1 801.22 4757951915669080 4655296518888015 801.22 \n", + "2 423.31 4518551904499919 4910949333064003 423.31 \n", + "3 382.73 4518551904499919 4415760195692405 382.73 \n", + "4 111.77 4098088980692974 4412940106031926 111.77 \n", + "... ... ... ... ... \n", + "24994 302.10 4179606860088849 4359198069543354 302.10 \n", + "24995 115.89 4751538620733305 4021524999937895 115.89 \n", + "24996 207.08 4405008355220324 4165276502284291 207.08 \n", + "24997 355.58 4092115788877543 4328901131757235 355.58 \n", + "24998 204.26 4262047194499006 4017367486513464 204.26 \n", + "\n", + " timestamp_year timestamp_month timestamp_day timestamp_hour \\\n", + "0 2024 4 5 4 \n", + "1 2024 2 6 7 \n", + "2 2024 3 15 0 \n", + "3 2024 1 17 7 \n", + "4 2024 1 3 4 \n", + "... ... ... ... ... \n", + "24994 2024 3 7 20 \n", + "24995 2024 4 3 6 \n", + "24996 2024 2 10 14 \n", + "24997 2024 4 18 7 \n", + "24998 2024 2 11 11 \n", + "\n", + " timestamp_minute timestamp_second distance \\\n", + "0 48 41 439.890 \n", + "1 21 3 400.610 \n", + "2 52 35 211.655 \n", + "3 45 5 191.365 \n", + "4 27 11 55.885 \n", + "... ... ... ... \n", + "24994 23 11 151.050 \n", + "24995 36 56 57.945 \n", + "24996 10 44 103.540 \n", + "24997 29 21 177.790 \n", + "24998 59 29 102.130 \n", + "\n", + " transaction_category_mapped \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... 
\n", + "24994 18 \n", + "24995 18 \n", + "24996 18 \n", + "24997 18 \n", + "24998 18 \n", + "\n", + "[24999 rows x 12 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fs\n", + "resp = transactions_fv.get_offline_features()\n", + "#Preview the dataset\n", + "fv_data = resp.to_dataframe()\n", + "fv_data" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "cb156ebe-9846-4ff3-a388-92362df7c741", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'amount_avg_1d': 10.45,\n", + " 'receiver_id': 4365439229004487,\n", + " 'sender_id': 4895135853273971,\n", + " 'amount': 10.45,\n", + " 'timestamp_year': 2024,\n", + " 'timestamp_month': 3,\n", + " 'timestamp_day': 20,\n", + " 'timestamp_hour': 4,\n", + " 'timestamp_minute': 33,\n", + " 'timestamp_second': 58,\n", + " 'distance': 5.225}]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svc = transactions_fv.get_online_feature_service()\n", + "resp = svc.get([{\"transaction_id\": \"24995\"}])\n", + "resp" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "7737865e-21a1-4bfe-b24a-29925145280f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[10.45,\n", + " 4365439229004487,\n", + " 4895135853273971,\n", + " 10.45,\n", + " 2024,\n", + " 3,\n", + " 20,\n", + " 4,\n", + " 33,\n", + " 58,\n", + " 5.225]]" ] }, - "execution_count": 22, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data_cols = list(data.columns)\n", - "data_cols" + "svc = transactions_fv.get_online_feature_service()\n", + "resp = svc.get([{\"transaction_id\": \"24995\"}],as_list=True)\n", + "resp" ] }, { - "cell_type": "code", - "execution_count": 23, - "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", + "cell_type": "markdown", + "id": "b5e4834e", "metadata": {}, - "outputs": [], 
"source": [ - "# Import MLRun's Feature Store\n", - "import mlrun.feature_store as fstore\n", - "\n", - "# create feature vector on top of aggreagations\n", - "# Define the list of features we will be using\n", - "features = [f\"transactions.{name}\" for name in data_cols] \n", + "We update the values in the feature store with the real values of our data" + ] + }, + { + "cell_type": "markdown", + "id": "e2f6395f", + "metadata": {}, + "source": [ + "And display them after getting them from the feature store" + ] + }, + { + "cell_type": "markdown", + "id": "cf148985", + "metadata": {}, + "source": [ + "We use the feature store to calculate the distance between the average of every category and the current amount" + ] + }, + { + "cell_type": "markdown", + "id": "289eeca6", + "metadata": {}, + "source": [ + "### 4. Create model \n", + "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", "\n", - "# Define the feature vector name for future reference\n", - "fv_name = 'transactions-vector'\n", "\n", - "# Define the feature vector using our Feature Store (fstore)\n", - "transactions_fv = fstore.FeatureVector(fv_name, \n", - " features, \n", - " description='stocks information')\n", "\n", - "#label_feature = 'transactions-v2.transaction_category',\n", - "# Save the feature vector in the Feature Store\n", - "transactions_fv.save()" + "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." 
] }, { "cell_type": "code", - "execution_count": 29, - "id": "7737865e-21a1-4bfe-b24a-29925145280f", + "execution_count": 38, + "id": "1cbb00b5-46bf-4a20-aad9-a03716ab97ae", "metadata": {}, "outputs": [ { @@ -1432,7 +2156,7 @@ " \n", " \n", " \n", - " transaction_category\n", + " transaction_category_mapped\n", " amount_avg_1d\n", " receiver_id\n", " sender_id\n", @@ -1449,78 +2173,78 @@ " \n", " \n", " 0\n", - " Uncategorized\n", - " 833.26\n", + " 0\n", + " 879.78\n", " 4518551904499919\n", - " 4333582346477646\n", - " 833.26\n", + " 4457298962882528\n", + " 879.78\n", " 2024\n", - " 1\n", + " 4\n", " 5\n", - " 12\n", - " 35\n", - " 2\n", - " 416.630\n", + " 4\n", + " 48\n", + " 41\n", + " 439.890\n", " \n", " \n", " 1\n", - " Uncategorized\n", - " 596.63\n", - " 4518551904499919\n", - " 4642413144038776\n", - " 596.63\n", - " 2023\n", - " 12\n", - " 9\n", - " 10\n", - " 30\n", - " 52\n", - " 298.315\n", + " 0\n", + " 801.22\n", + " 4757951915669080\n", + " 4655296518888015\n", + " 801.22\n", + " 2024\n", + " 2\n", + " 6\n", + " 7\n", + " 21\n", + " 3\n", + " 400.610\n", " \n", " \n", " 2\n", - " Uncategorized\n", - " 176.76\n", - " 4274544022939522\n", - " 4952665515556751\n", - " 176.76\n", - " 2023\n", - " 12\n", - " 19\n", - " 11\n", - " 6\n", + " 0\n", + " 423.31\n", + " 4518551904499919\n", + " 4910949333064003\n", + " 423.31\n", + " 2024\n", + " 3\n", + " 15\n", + " 0\n", " 52\n", - " 88.380\n", + " 35\n", + " 211.655\n", " \n", " \n", " 3\n", - " Uncategorized\n", - " 879.78\n", + " 0\n", + " 382.73\n", " 4518551904499919\n", - " 4457298962882528\n", - " 879.78\n", + " 4415760195692405\n", + " 382.73\n", " 2024\n", - " 2\n", - " 4\n", - " 8\n", - " 51\n", - " 39\n", - " 439.890\n", + " 1\n", + " 17\n", + " 7\n", + " 45\n", + " 5\n", + " 191.365\n", " \n", " \n", " 4\n", - " Uncategorized\n", - " 742.25\n", - " 4601853246125220\n", - " 4578126462896710\n", - " 742.25\n", + " 0\n", + " 111.77\n", + " 4098088980692974\n", + " 4412940106031926\n", + " 
111.77\n", " 2024\n", " 1\n", - " 30\n", - " 8\n", + " 3\n", + " 4\n", " 27\n", - " 36\n", - " 371.125\n", + " 11\n", + " 55.885\n", " \n", " \n", " ...\n", @@ -1538,198 +2262,165 @@ " ...\n", " \n", " \n", - " 99992\n", - " Pension and insurances\n", - " 205.43\n", - " 4405008355220324\n", - " 4583355906735225\n", - " 205.43\n", + " 24994\n", + " 18\n", + " 302.10\n", + " 4179606860088849\n", + " 4359198069543354\n", + " 302.10\n", " 2024\n", - " 2\n", - " 15\n", - " 5\n", - " 1\n", - " 13\n", - " 102.715\n", + " 3\n", + " 7\n", + " 20\n", + " 23\n", + " 11\n", + " 151.050\n", " \n", " \n", - " 99993\n", - " Pension and insurances\n", - " 151.49\n", - " 4300416744511335\n", - " 4949240916846171\n", - " 151.49\n", + " 24995\n", + " 18\n", + " 115.89\n", + " 4751538620733305\n", + " 4021524999937895\n", + " 115.89\n", " 2024\n", - " 1\n", - " 19\n", - " 12\n", - " 7\n", - " 38\n", - " 75.745\n", + " 4\n", + " 3\n", + " 6\n", + " 36\n", + " 56\n", + " 57.945\n", " \n", " \n", - " 99994\n", - " Pension and insurances\n", - " 188.28\n", + " 24996\n", + " 18\n", + " 207.08\n", " 4405008355220324\n", - " 4996896020767264\n", - " 188.28\n", + " 4165276502284291\n", + " 207.08\n", " 2024\n", - " 1\n", - " 3\n", - " 12\n", - " 28\n", - " 30\n", - " 94.140\n", + " 2\n", + " 10\n", + " 14\n", + " 10\n", + " 44\n", + " 103.540\n", " \n", " \n", - " 99995\n", - " Pension and insurances\n", + " 24997\n", + " 18\n", + " 355.58\n", + " 4092115788877543\n", + " 4328901131757235\n", + " 355.58\n", + " 2024\n", + " 4\n", + " 18\n", + " 7\n", + " 29\n", + " 21\n", + " 177.790\n", + " \n", + " \n", + " 24998\n", + " 18\n", " 204.26\n", " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2023\n", - " 12\n", - " 12\n", - " 16\n", - " 2\n", - " 27\n", - " 102.130\n", - " \n", - " \n", - " 99996\n", - " Pension and insurances\n", - " 207.92\n", - " 4627516674144704\n", - " 4250420705087194\n", - " 207.92\n", " 2024\n", " 2\n", - " 8\n", - " 17\n", - " 19\n", - " 20\n", - " 
103.960\n", + " 11\n", + " 11\n", + " 59\n", + " 29\n", + " 102.130\n", " \n", " \n", "\n", - "

99997 rows × 12 columns

\n", + "

24999 rows × 12 columns

\n", "" ], "text/plain": [ - " transaction_category amount_avg_1d receiver_id \\\n", - "0 Uncategorized 833.26 4518551904499919 \n", - "1 Uncategorized 596.63 4518551904499919 \n", - "2 Uncategorized 176.76 4274544022939522 \n", - "3 Uncategorized 879.78 4518551904499919 \n", - "4 Uncategorized 742.25 4601853246125220 \n", - "... ... ... ... \n", - "99992 Pension and insurances 205.43 4405008355220324 \n", - "99993 Pension and insurances 151.49 4300416744511335 \n", - "99994 Pension and insurances 188.28 4405008355220324 \n", - "99995 Pension and insurances 204.26 4262047194499006 \n", - "99996 Pension and insurances 207.92 4627516674144704 \n", + " transaction_category_mapped amount_avg_1d receiver_id \\\n", + "0 0 879.78 4518551904499919 \n", + "1 0 801.22 4757951915669080 \n", + "2 0 423.31 4518551904499919 \n", + "3 0 382.73 4518551904499919 \n", + "4 0 111.77 4098088980692974 \n", + "... ... ... ... \n", + "24994 18 302.10 4179606860088849 \n", + "24995 18 115.89 4751538620733305 \n", + "24996 18 207.08 4405008355220324 \n", + "24997 18 355.58 4092115788877543 \n", + "24998 18 204.26 4262047194499006 \n", "\n", " sender_id amount timestamp_year timestamp_month \\\n", - "0 4333582346477646 833.26 2024 1 \n", - "1 4642413144038776 596.63 2023 12 \n", - "2 4952665515556751 176.76 2023 12 \n", - "3 4457298962882528 879.78 2024 2 \n", - "4 4578126462896710 742.25 2024 1 \n", + "0 4457298962882528 879.78 2024 4 \n", + "1 4655296518888015 801.22 2024 2 \n", + "2 4910949333064003 423.31 2024 3 \n", + "3 4415760195692405 382.73 2024 1 \n", + "4 4412940106031926 111.77 2024 1 \n", "... ... ... ... ... 
\n", - "99992 4583355906735225 205.43 2024 2 \n", - "99993 4949240916846171 151.49 2024 1 \n", - "99994 4996896020767264 188.28 2024 1 \n", - "99995 4017367486513464 204.26 2023 12 \n", - "99996 4250420705087194 207.92 2024 2 \n", + "24994 4359198069543354 302.10 2024 3 \n", + "24995 4021524999937895 115.89 2024 4 \n", + "24996 4165276502284291 207.08 2024 2 \n", + "24997 4328901131757235 355.58 2024 4 \n", + "24998 4017367486513464 204.26 2024 2 \n", "\n", " timestamp_day timestamp_hour timestamp_minute timestamp_second \\\n", - "0 5 12 35 2 \n", - "1 9 10 30 52 \n", - "2 19 11 6 52 \n", - "3 4 8 51 39 \n", - "4 30 8 27 36 \n", + "0 5 4 48 41 \n", + "1 6 7 21 3 \n", + "2 15 0 52 35 \n", + "3 17 7 45 5 \n", + "4 3 4 27 11 \n", "... ... ... ... ... \n", - "99992 15 5 1 13 \n", - "99993 19 12 7 38 \n", - "99994 3 12 28 30 \n", - "99995 12 16 2 27 \n", - "99996 8 17 19 20 \n", + "24994 7 20 23 11 \n", + "24995 3 6 36 56 \n", + "24996 10 14 10 44 \n", + "24997 18 7 29 21 \n", + "24998 11 11 59 29 \n", "\n", " distance \n", - "0 416.630 \n", - "1 298.315 \n", - "2 88.380 \n", - "3 439.890 \n", - "4 371.125 \n", + "0 439.890 \n", + "1 400.610 \n", + "2 211.655 \n", + "3 191.365 \n", + "4 55.885 \n", "... ... 
\n", - "99992 102.715 \n", - "99993 75.745 \n", - "99994 94.140 \n", - "99995 102.130 \n", - "99996 103.960 \n", + "24994 151.050 \n", + "24995 57.945 \n", + "24996 103.540 \n", + "24997 177.790 \n", + "24998 102.130 \n", "\n", - "[99997 rows x 12 columns]" + "[24999 rows x 12 columns]" ] }, - "execution_count": 29, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import mlrun.feature_store as fs\n", - "resp = fs.FeatureVector.get_offline_features(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector:latest\")\n", + "resp = transactions_fv.get_offline_features()\n", "#Preview the dataset\n", "fv_data = resp.to_dataframe()\n", - "fv_data\n", "\n", - "# svc = fs.FeatureVector.get_online_feature_service(\"store://feature-vectors/sagemaker-v3-admin/transactions-vector:latest\")\n", - "# resp = svc.get([{\"transaction_id\": \"99996\"}])\n", - "# resp" - ] - }, - { - "cell_type": "markdown", - "id": "b5e4834e", - "metadata": {}, - "source": [ - "We update the values in the feature store with the real values of our data" - ] - }, - { - "cell_type": "markdown", - "id": "e2f6395f", - "metadata": {}, - "source": [ - "And display them after getting them from the feature store" - ] - }, - { - "cell_type": "markdown", - "id": "cf148985", - "metadata": {}, - "source": [ - "We use the feature store to calculate the distance between the average of every category and the current amount" - ] - }, - { - "cell_type": "markdown", - "id": "289eeca6", - "metadata": {}, - "source": [ - "### 4. Create model \n", - "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. 
The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", + "column_to_move = 'transaction_category_mapped'\n", "\n", + "new_columns_order = [column_to_move] + [col for col in fv_data.columns if col != column_to_move]\n", + "fv_data = fv_data[new_columns_order]\n", "\n", "\n", - "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." + "data = fv_data\n", + "data" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 39, "id": "47512de3-60ac-49c7-ace8-031959527e86", "metadata": {}, "outputs": [], @@ -1750,7 +2441,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 40, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -1770,7 +2461,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 41, "id": "e1ca2543", "metadata": {}, "outputs": [], @@ -1796,7 +2487,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 42, "id": "a41b6a7d", "metadata": {}, "outputs": [], @@ -1814,7 +2505,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 43, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -1837,7 +2528,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 44, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -1862,7 +2553,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 45, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -1890,7 +2581,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 46, "id": "c24e06fc", "metadata": { "scrolled": true @@ -1900,141 +2591,141 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: 
sagemaker-xgboost-2024-02-08-17-31-38-814\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-12-03-29-991\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-02-08 17:31:38 Starting - Starting the training job...\n", - "2024-02-08 17:32:02 Starting - Preparing the instances for training.........\n", - "2024-02-08 17:33:23 Downloading - Downloading input data...\n", - "2024-02-08 17:33:53 Downloading - Downloading the training image......\n", - "2024-02-08 17:34:43 Training - Training image download completed. Training in progress.\u001b[34m[2024-02-08 17:35:00.315 ip-10-0-178-178.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-11 12:03:30 Starting - Starting the training job...\n", + "2024-02-11 12:03:45 Starting - Preparing the instances for training......\n", + "2024-02-11 12:04:42 Downloading - Downloading input data...\n", + "2024-02-11 12:05:23 Downloading - Downloading the training image......\n", + "2024-02-11 12:06:08 Training - Training image download completed. 
Training in progress...\u001b[34m[2024-02-11 12:06:25.555 ip-10-0-155-6.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Train matrix has 69997 rows and 11 columns\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Validation matrix has 20000 rows\u001b[0m\n", - "\u001b[34m[2024-02-08 17:35:00.471 ip-10-0-178-178.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-08 17:35:00.472 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-08 17:35:00.472 ip-10-0-178-178.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-08 17:35:00.473 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-08 17:35:00.473 ip-10-0-178-178.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-08:17:35:00:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[2024-02-08 17:35:00.749 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-08 17:35:00.752 ip-10-0-178-178.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[6]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - 
"\u001b[34m[12]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[29]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - 
"\u001b[34m[35]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[52]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - 
"\u001b[34m[58]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - 
"\u001b[34m[81]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.00000#011validation-merror:0.00000\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + 
"\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Train matrix has 17499 rows and 11 columns\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Validation matrix has 5000 rows\u001b[0m\n", + "\u001b[34m[2024-02-11 12:06:25.675 ip-10-0-155-6.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-11 12:06:25.676 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-11 12:06:25.676 ip-10-0-155-6.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-11 12:06:25.677 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-11 12:06:25.677 ip-10-0-155-6.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-11:12:06:25:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.55803#011validation-merror:0.57260\u001b[0m\n", + "\u001b[34m[2024-02-11 12:06:25.895 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-11 12:06:25.898 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.54860#011validation-merror:0.57440\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.53409#011validation-merror:0.55980\u001b[0m\n", + 
"\u001b[34m[3]#011train-merror:0.53197#011validation-merror:0.55740\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.52729#011validation-merror:0.55340\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.51700#011validation-merror:0.54420\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.51254#011validation-merror:0.54000\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.50969#011validation-merror:0.53540\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.50351#011validation-merror:0.53400\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.49934#011validation-merror:0.53040\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.49380#011validation-merror:0.52340\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.48403#011validation-merror:0.51520\u001b[0m\n", + "\u001b[34m[12]#011train-merror:0.47951#011validation-merror:0.50980\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.47700#011validation-merror:0.50820\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.47111#011validation-merror:0.50620\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.46694#011validation-merror:0.50000\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.46020#011validation-merror:0.49200\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.45723#011validation-merror:0.48980\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.44991#011validation-merror:0.48140\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.44317#011validation-merror:0.47780\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.43831#011validation-merror:0.47340\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.43551#011validation-merror:0.46980\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.42711#011validation-merror:0.45920\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.42134#011validation-merror:0.45440\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.41671#011validation-merror:0.45100\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.41425#011validation-merror:0.44940\u001b[0m\n", + 
"\u001b[34m[26]#011train-merror:0.41157#011validation-merror:0.44800\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.40688#011validation-merror:0.44400\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.40151#011validation-merror:0.43880\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.39785#011validation-merror:0.43400\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.39339#011validation-merror:0.43000\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.38625#011validation-merror:0.42440\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.38214#011validation-merror:0.42200\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.37802#011validation-merror:0.41940\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.37254#011validation-merror:0.41460\u001b[0m\n", + "\u001b[34m[35]#011train-merror:0.36773#011validation-merror:0.41160\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.36499#011validation-merror:0.40940\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.35808#011validation-merror:0.40100\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.35574#011validation-merror:0.39940\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.35202#011validation-merror:0.39540\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.35031#011validation-merror:0.39400\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.34813#011validation-merror:0.39160\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.34339#011validation-merror:0.38680\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.33756#011validation-merror:0.38040\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.33545#011validation-merror:0.37880\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.33071#011validation-merror:0.37360\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.32808#011validation-merror:0.36900\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.32608#011validation-merror:0.36620\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.31939#011validation-merror:0.36220\u001b[0m\n", + 
"\u001b[34m[49]#011train-merror:0.31556#011validation-merror:0.35900\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.31396#011validation-merror:0.35520\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.31265#011validation-merror:0.35400\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.31030#011validation-merror:0.35160\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.30447#011validation-merror:0.34620\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.30236#011validation-merror:0.34480\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.29933#011validation-merror:0.34280\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.29693#011validation-merror:0.34120\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.29464#011validation-merror:0.34020\u001b[0m\n", + "\u001b[34m[58]#011train-merror:0.28904#011validation-merror:0.33420\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.28664#011validation-merror:0.33260\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.28579#011validation-merror:0.33200\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.28082#011validation-merror:0.32700\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.27813#011validation-merror:0.32440\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.27727#011validation-merror:0.32360\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.27270#011validation-merror:0.32140\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.26813#011validation-merror:0.31800\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.26464#011validation-merror:0.31440\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.26333#011validation-merror:0.31320\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.26076#011validation-merror:0.31100\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.25916#011validation-merror:0.30900\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.25613#011validation-merror:0.30560\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.25510#011validation-merror:0.30640\u001b[0m\n", + 
"\u001b[34m[72]#011train-merror:0.25373#011validation-merror:0.30480\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.25190#011validation-merror:0.30220\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.24950#011validation-merror:0.29860\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.24504#011validation-merror:0.29260\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.24304#011validation-merror:0.29080\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.24099#011validation-merror:0.29060\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.23904#011validation-merror:0.28880\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.23681#011validation-merror:0.28720\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.23584#011validation-merror:0.28460\u001b[0m\n", + "\u001b[34m[81]#011train-merror:0.23327#011validation-merror:0.28280\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.23161#011validation-merror:0.28120\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.22864#011validation-merror:0.27860\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.22470#011validation-merror:0.27680\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.22241#011validation-merror:0.27580\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.22087#011validation-merror:0.27440\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.21818#011validation-merror:0.27160\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.21744#011validation-merror:0.27000\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.21584#011validation-merror:0.26800\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.21476#011validation-merror:0.26640\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.21241#011validation-merror:0.26600\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.21161#011validation-merror:0.26440\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.21018#011validation-merror:0.26280\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.20927#011validation-merror:0.26160\u001b[0m\n", + 
"\u001b[34m[95]#011train-merror:0.20641#011validation-merror:0.25820\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.20544#011validation-merror:0.25840\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.20390#011validation-merror:0.25700\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.20287#011validation-merror:0.25520\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.20207#011validation-merror:0.25340\u001b[0m\n", "\n", - "2024-02-08 17:35:29 Uploading - Uploading generated training model\n", - "2024-02-08 17:35:45 Completed - Training job completed\n", + "2024-02-11 12:06:53 Uploading - Uploading generated training model\n", + "2024-02-11 12:07:05 Completed - Training job completed\n", "Training seconds: 142\n", "Billable seconds: 142\n" ] @@ -2056,17 +2747,17 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 47, "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-08-17-31-38-814/output/model.tar.gz'" + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-12-03-29-991/output/model.tar.gz'" ] }, - "execution_count": 38, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -2077,7 +2768,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": [], @@ -2087,78 +2778,28 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "xgboost-model\n", - "\n", - "xgboost-model\n", - "\n", - "\n", - "\n", - "_start->xgboost-model\n", - "\n", - "\n", - "\n", 
- "\n", - "\n", - "postprocess\n", - "\n", - "postprocess\n", - "\n", - "\n", - "\n", - "xgboost-model->postprocess\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# Set the topology and get the graph object:\n", - "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", + "graph = serving_function.set_topology(\n", + " \"router\",\n", + " mlrun.serving.routers.EnrichmentModelRouter(\n", + " feature_vector_uri=\"store://feature-vectors/sagemaker-v8-admin/transactions-vector:latest\",\n", + " impute_policy={\"*\": \"$mean\"}),\n", + ")\n", + "\n", + "#graph.to(handler=\"postprocess\", name=\"postprocess\").respond()\n", + "# # add the 3 trained models to the Ensemble\n", + "# for model in project.list_models('', tag='latest'):\n", + "# name = model.spec.db_key\n", + "# serving_fn.add_model(name, class_name=\"ClassifierModel\", model_path=model.uri)\n", "\n", - "# Add the steps:\n", - "graph.to(\"XGBModelServer\",\n", - " name=\"xgboost-model\",\n", - " model_path=xgb.model_data) \\\n", - " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", + "serving_function.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=model_path)\n", "\n", - "# Plot to graph:\n", - "serving_function.plot(rankdir='LR')" + "# Plot the ensemble configuration\n", + "serving_function.spec.graph.plot()" ] }, { From 749d4e1f9f146210330f8789cc2dd1377710c98f Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Sun, 11 Feb 2024 14:29:01 +0000 Subject: [PATCH 09/16] update evaluate category path --- financial_payment_classification_v3.ipynb | 1583 +++++++++++++++------ src/functions/evaluate.py | 2 +- 2 files changed, 1143 insertions(+), 442 deletions(-) diff --git a/financial_payment_classification_v3.ipynb b/financial_payment_classification_v3.ipynb index 6cc8ec8..b514640 100644 --- 
a/financial_payment_classification_v3.ipynb +++ b/financial_payment_classification_v3.ipynb @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 76, "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", "metadata": { "editable": true, @@ -108,13 +108,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-11 11:55:08,243 [info] Project loaded successfully: {'project_name': 'sagemaker-v8'}\n" + "> 2024-02-11 14:25:22,851 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" ] } ], "source": [ "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v8\", \n", + " name=\"sagemaker-v3\", \n", " user_project=True,\n", " parameters={\n", " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", @@ -352,7 +352,7 @@ " 4518551904499919\n", " 4333582346477646\n", " 833.26\n", - " 2024-01-08 07:10:52.429358\n", + " 2024-01-08 08:38:24.016496\n", " \n", " \n", " 1\n", @@ -360,7 +360,7 @@ " 4518551904499919\n", " 4642413144038776\n", " 596.63\n", - " 2023-12-12 05:06:42.429358\n", + " 2023-12-12 06:34:14.016496\n", " \n", " \n", " 2\n", @@ -368,7 +368,7 @@ " 4274544022939522\n", " 4952665515556751\n", " 176.76\n", - " 2023-12-22 05:42:42.429358\n", + " 2023-12-22 07:10:14.016496\n", " \n", " \n", " 3\n", @@ -376,7 +376,7 @@ " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", - " 2024-02-07 03:27:29.429358\n", + " 2024-02-07 04:55:01.016496\n", " \n", " \n", " 4\n", @@ -384,7 +384,7 @@ " 4601853246125220\n", " 4578126462896710\n", " 742.25\n", - " 2024-02-02 03:03:26.429358\n", + " 2024-02-02 04:30:58.016496\n", " \n", " \n", " ...\n", @@ -400,7 +400,7 @@ " 4405008355220324\n", " 4583355906735225\n", " 205.43\n", - " 2024-02-17 23:37:03.429358\n", + " 2024-02-18 01:04:35.016496\n", " \n", " \n", " 99993\n", @@ -408,7 +408,7 @@ " 4300416744511335\n", " 4949240916846171\n", " 151.49\n", - " 2024-01-22 06:43:28.429358\n", + " 2024-01-22 08:11:00.016496\n", " \n", " \n", " 99994\n", @@ -416,7 
+416,7 @@ " 4405008355220324\n", " 4996896020767264\n", " 188.28\n", - " 2024-01-06 07:04:20.429358\n", + " 2024-01-06 08:31:52.016496\n", " \n", " \n", " 99995\n", @@ -424,7 +424,7 @@ " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2023-12-15 10:38:17.429358\n", + " 2023-12-15 12:05:49.016496\n", " \n", " \n", " 99996\n", @@ -432,7 +432,7 @@ " 4627516674144704\n", " 4250420705087194\n", " 207.92\n", - " 2024-02-11 11:55:10.429358\n", + " 2024-02-11 13:22:42.016496\n", " \n", " \n", "\n", @@ -454,17 +454,17 @@ "99996 Pension and insurances 4627516674144704 4250420705087194 207.92 \n", "\n", " timestamp \n", - "0 2024-01-08 07:10:52.429358 \n", - "1 2023-12-12 05:06:42.429358 \n", - "2 2023-12-22 05:42:42.429358 \n", - "3 2024-02-07 03:27:29.429358 \n", - "4 2024-02-02 03:03:26.429358 \n", + "0 2024-01-08 08:38:24.016496 \n", + "1 2023-12-12 06:34:14.016496 \n", + "2 2023-12-22 07:10:14.016496 \n", + "3 2024-02-07 04:55:01.016496 \n", + "4 2024-02-02 04:30:58.016496 \n", "... ... 
\n", - "99992 2024-02-17 23:37:03.429358 \n", - "99993 2024-01-22 06:43:28.429358 \n", - "99994 2024-01-06 07:04:20.429358 \n", - "99995 2023-12-15 10:38:17.429358 \n", - "99996 2024-02-11 11:55:10.429358 \n", + "99992 2024-02-18 01:04:35.016496 \n", + "99993 2024-01-22 08:11:00.016496 \n", + "99994 2024-01-06 08:31:52.016496 \n", + "99995 2023-12-15 12:05:49.016496 \n", + "99996 2024-02-11 13:22:42.016496 \n", "\n", "[99997 rows x 5 columns]" ] @@ -671,7 +671,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 14, @@ -754,7 +754,7 @@ " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", - " 2024-02-07 03:27:29.429358\n", + " 2024-02-07 04:55:01.016496\n", " 3\n", " \n", " \n", @@ -763,7 +763,7 @@ " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", - " 2023-12-10 05:59:51.429358\n", + " 2023-12-10 07:27:23.016496\n", " 7\n", " \n", " \n", @@ -772,7 +772,7 @@ " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", - " 2024-01-16 23:31:23.429358\n", + " 2024-01-17 00:58:55.016496\n", " 11\n", " \n", " \n", @@ -781,7 +781,7 @@ " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", - " 2023-11-20 06:23:53.429358\n", + " 2023-11-20 07:51:25.016496\n", " 15\n", " \n", " \n", @@ -790,7 +790,7 @@ " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", - " 2023-11-06 03:05:59.429358\n", + " 2023-11-06 04:33:31.016496\n", " 19\n", " \n", " \n", @@ -808,7 +808,7 @@ " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", - " 2024-01-09 19:01:59.429358\n", + " 2024-01-09 20:29:31.016496\n", " 99979\n", " \n", " \n", @@ -817,7 +817,7 @@ " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", - " 2024-02-05 05:15:44.429358\n", + " 2024-02-05 06:43:16.016496\n", " 99983\n", " \n", " \n", @@ -826,7 +826,7 @@ " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", - " 2023-12-14 12:49:32.429358\n", + " 2023-12-14 14:17:04.016496\n", " 99987\n", " \n", " \n", @@ -835,7 +835,7 @@ " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", - " 
2024-02-20 06:08:09.429358\n", + " 2024-02-20 07:35:41.016496\n", " 99991\n", " \n", " \n", @@ -844,7 +844,7 @@ " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2023-12-15 10:38:17.429358\n", + " 2023-12-15 12:05:49.016496\n", " 99995\n", " \n", " \n", @@ -867,17 +867,17 @@ "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", "\n", " timestamp transaction_id \n", - "3 2024-02-07 03:27:29.429358 3 \n", - "7 2023-12-10 05:59:51.429358 7 \n", - "11 2024-01-16 23:31:23.429358 11 \n", - "15 2023-11-20 06:23:53.429358 15 \n", - "19 2023-11-06 03:05:59.429358 19 \n", + "3 2024-02-07 04:55:01.016496 3 \n", + "7 2023-12-10 07:27:23.016496 7 \n", + "11 2024-01-17 00:58:55.016496 11 \n", + "15 2023-11-20 07:51:25.016496 15 \n", + "19 2023-11-06 04:33:31.016496 19 \n", "... ... ... \n", - "99979 2024-01-09 19:01:59.429358 99979 \n", - "99983 2024-02-05 05:15:44.429358 99983 \n", - "99987 2023-12-14 12:49:32.429358 99987 \n", - "99991 2024-02-20 06:08:09.429358 99991 \n", - "99995 2023-12-15 10:38:17.429358 99995 \n", + "99979 2024-01-09 20:29:31.016496 99979 \n", + "99983 2024-02-05 06:43:16.016496 99983 \n", + "99987 2023-12-14 14:17:04.016496 99987 \n", + "99991 2024-02-20 07:35:41.016496 99991 \n", + "99995 2023-12-15 12:05:49.016496 99995 \n", "\n", "[24999 rows x 6 columns]" ] @@ -906,7 +906,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "id": "06c03ea5-8394-44ff-b81d-755e1c244269", "metadata": {}, "outputs": [ @@ -946,7 +946,7 @@ " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", - " 2024-04-05 04:48:41.521043\n", + " 2024-04-05 06:11:54.114091\n", " 3\n", " \n", " \n", @@ -955,7 +955,7 @@ " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", - " 2024-02-06 07:21:03.521043\n", + " 2024-02-06 08:44:16.114091\n", " 7\n", " \n", " \n", @@ -964,7 +964,7 @@ " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", - " 2024-03-15 00:52:35.521043\n", + " 2024-03-15 02:15:48.114091\n", " 11\n", 
" \n", " \n", @@ -973,7 +973,7 @@ " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", - " 2024-01-17 07:45:05.521043\n", + " 2024-01-17 09:08:18.114091\n", " 15\n", " \n", " \n", @@ -982,7 +982,7 @@ " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", - " 2024-01-03 04:27:11.521043\n", + " 2024-01-03 05:50:24.114091\n", " 19\n", " \n", " \n", @@ -1000,7 +1000,7 @@ " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", - " 2024-03-07 20:23:11.521043\n", + " 2024-03-07 21:46:24.114091\n", " 99979\n", " \n", " \n", @@ -1009,7 +1009,7 @@ " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", - " 2024-04-03 06:36:56.521043\n", + " 2024-04-03 08:00:09.114091\n", " 99983\n", " \n", " \n", @@ -1018,7 +1018,7 @@ " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", - " 2024-02-10 14:10:44.521043\n", + " 2024-02-10 15:33:57.114091\n", " 99987\n", " \n", " \n", @@ -1027,7 +1027,7 @@ " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", - " 2024-04-18 07:29:21.521043\n", + " 2024-04-18 08:52:34.114091\n", " 99991\n", " \n", " \n", @@ -1036,7 +1036,7 @@ " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2024-02-11 11:59:29.521043\n", + " 2024-02-11 13:22:42.114091\n", " 99995\n", " \n", " \n", @@ -1059,22 +1059,22 @@ "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", "\n", " timestamp transaction_id \n", - "3 2024-04-05 04:48:41.521043 3 \n", - "7 2024-02-06 07:21:03.521043 7 \n", - "11 2024-03-15 00:52:35.521043 11 \n", - "15 2024-01-17 07:45:05.521043 15 \n", - "19 2024-01-03 04:27:11.521043 19 \n", + "3 2024-04-05 06:11:54.114091 3 \n", + "7 2024-02-06 08:44:16.114091 7 \n", + "11 2024-03-15 02:15:48.114091 11 \n", + "15 2024-01-17 09:08:18.114091 15 \n", + "19 2024-01-03 05:50:24.114091 19 \n", "... ... ... 
\n", - "99979 2024-03-07 20:23:11.521043 99979 \n", - "99983 2024-04-03 06:36:56.521043 99983 \n", - "99987 2024-02-10 14:10:44.521043 99987 \n", - "99991 2024-04-18 07:29:21.521043 99991 \n", - "99995 2024-02-11 11:59:29.521043 99995 \n", + "99979 2024-03-07 21:46:24.114091 99979 \n", + "99983 2024-04-03 08:00:09.114091 99983 \n", + "99987 2024-02-10 15:33:57.114091 99987 \n", + "99991 2024-04-18 08:52:34.114091 99991 \n", + "99995 2024-02-11 13:22:42.114091 99995 \n", "\n", "[24999 rows x 6 columns]" ] }, - "execution_count": 26, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1087,12 +1087,19 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", "metadata": { "scrolled": true }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 13:22:52,204 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" + ] + }, { "data": { "text/html": [ @@ -1154,9 +1161,9 @@ " 2024\n", " 4\n", " 5\n", - " 4\n", - " 48\n", - " 41\n", + " 6\n", + " 11\n", + " 54\n", " 439.890\n", " \n", " \n", @@ -1169,9 +1176,9 @@ " 2024\n", " 2\n", " 6\n", - " 7\n", - " 21\n", - " 3\n", + " 8\n", + " 44\n", + " 16\n", " 400.610\n", " \n", " \n", @@ -1184,9 +1191,9 @@ " 2024\n", " 3\n", " 15\n", - " 0\n", - " 52\n", - " 35\n", + " 2\n", + " 15\n", + " 48\n", " 211.655\n", " \n", " \n", @@ -1199,9 +1206,9 @@ " 2024\n", " 1\n", " 17\n", - " 7\n", - " 45\n", - " 5\n", + " 9\n", + " 8\n", + " 18\n", " 191.365\n", " \n", " \n", @@ -1214,9 +1221,9 @@ " 2024\n", " 1\n", " 3\n", - " 4\n", - " 27\n", - " 11\n", + " 5\n", + " 50\n", + " 24\n", " 55.885\n", " \n", " \n", @@ -1244,9 +1251,9 @@ " 2024\n", " 3\n", " 7\n", - " 20\n", - " 23\n", - " 11\n", + " 21\n", + " 46\n", + " 24\n", " 151.050\n", " \n", " \n", @@ -1259,9 +1266,9 @@ " 2024\n", " 4\n", " 3\n", - " 6\n", - " 36\n", - " 56\n", + " 8\n", + " 
0\n", + " 9\n", " 57.945\n", " \n", " \n", @@ -1274,9 +1281,9 @@ " 2024\n", " 2\n", " 10\n", - " 14\n", - " 10\n", - " 44\n", + " 15\n", + " 33\n", + " 57\n", " 103.540\n", " \n", " \n", @@ -1289,9 +1296,9 @@ " 2024\n", " 4\n", " 18\n", - " 7\n", - " 29\n", - " 21\n", + " 8\n", + " 52\n", + " 34\n", " 177.790\n", " \n", " \n", @@ -1304,9 +1311,9 @@ " 2024\n", " 2\n", " 11\n", - " 11\n", - " 59\n", - " 29\n", + " 13\n", + " 22\n", + " 42\n", " 102.130\n", " \n", " \n", @@ -1345,36 +1352,36 @@ "\n", " timestamp_day timestamp_hour timestamp_minute \\\n", "transaction_id \n", - "3 5 4 48 \n", - "7 6 7 21 \n", - "11 15 0 52 \n", - "15 17 7 45 \n", - "19 3 4 27 \n", + "3 5 6 11 \n", + "7 6 8 44 \n", + "11 15 2 15 \n", + "15 17 9 8 \n", + "19 3 5 50 \n", "... ... ... ... \n", - "99979 7 20 23 \n", - "99983 3 6 36 \n", - "99987 10 14 10 \n", - "99991 18 7 29 \n", - "99995 11 11 59 \n", + "99979 7 21 46 \n", + "99983 3 8 0 \n", + "99987 10 15 33 \n", + "99991 18 8 52 \n", + "99995 11 13 22 \n", "\n", " timestamp_second distance \n", "transaction_id \n", - "3 41 439.890 \n", - "7 3 400.610 \n", - "11 35 211.655 \n", - "15 5 191.365 \n", - "19 11 55.885 \n", + "3 54 439.890 \n", + "7 16 400.610 \n", + "11 48 211.655 \n", + "15 18 191.365 \n", + "19 24 55.885 \n", "... ... ... 
\n", - "99979 11 151.050 \n", - "99983 56 57.945 \n", - "99987 44 103.540 \n", - "99991 21 177.790 \n", - "99995 29 102.130 \n", + "99979 24 151.050 \n", + "99983 9 57.945 \n", + "99987 57 103.540 \n", + "99991 34 177.790 \n", + "99995 42 102.130 \n", "\n", "[24999 rows x 12 columns]" ] }, - "execution_count": 27, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1386,7 +1393,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 18, "id": "6595564d-91a2-49c0-93e1-dc8ebb28467d", "metadata": {}, "outputs": [ @@ -1451,9 +1458,9 @@ " 2024\n", " 4\n", " 5\n", - " 4\n", - " 48\n", - " 41\n", + " 6\n", + " 11\n", + " 54\n", " 439.890\n", " \n", " \n", @@ -1466,9 +1473,9 @@ " 2024\n", " 2\n", " 6\n", - " 7\n", - " 21\n", - " 3\n", + " 8\n", + " 44\n", + " 16\n", " 400.610\n", " \n", " \n", @@ -1481,9 +1488,9 @@ " 2024\n", " 3\n", " 15\n", - " 0\n", - " 52\n", - " 35\n", + " 2\n", + " 15\n", + " 48\n", " 211.655\n", " \n", " \n", @@ -1496,9 +1503,9 @@ " 2024\n", " 1\n", " 17\n", - " 7\n", - " 45\n", - " 5\n", + " 9\n", + " 8\n", + " 18\n", " 191.365\n", " \n", " \n", @@ -1511,9 +1518,9 @@ " 2024\n", " 1\n", " 3\n", - " 4\n", - " 27\n", - " 11\n", + " 5\n", + " 50\n", + " 24\n", " 55.885\n", " \n", " \n", @@ -1541,9 +1548,9 @@ " 2024\n", " 3\n", " 7\n", - " 20\n", - " 23\n", - " 11\n", + " 21\n", + " 46\n", + " 24\n", " 151.050\n", " \n", " \n", @@ -1556,9 +1563,9 @@ " 2024\n", " 4\n", " 3\n", - " 6\n", - " 36\n", - " 56\n", + " 8\n", + " 0\n", + " 9\n", " 57.945\n", " \n", " \n", @@ -1571,9 +1578,9 @@ " 2024\n", " 2\n", " 10\n", - " 14\n", - " 10\n", - " 44\n", + " 15\n", + " 33\n", + " 57\n", " 103.540\n", " \n", " \n", @@ -1586,9 +1593,9 @@ " 2024\n", " 4\n", " 18\n", - " 7\n", - " 29\n", - " 21\n", + " 8\n", + " 52\n", + " 34\n", " 177.790\n", " \n", " \n", @@ -1601,9 +1608,9 @@ " 2024\n", " 2\n", " 11\n", - " 11\n", - " 59\n", - " 29\n", + " 13\n", + " 22\n", + " 42\n", " 102.130\n", " \n", " \n", @@ -1642,36 +1649,36 
@@ "\n", " timestamp_day timestamp_hour timestamp_minute \\\n", "transaction_id \n", - "3 5 4 48 \n", - "7 6 7 21 \n", - "11 15 0 52 \n", - "15 17 7 45 \n", - "19 3 4 27 \n", + "3 5 6 11 \n", + "7 6 8 44 \n", + "11 15 2 15 \n", + "15 17 9 8 \n", + "19 3 5 50 \n", "... ... ... ... \n", - "99979 7 20 23 \n", - "99983 3 6 36 \n", - "99987 10 14 10 \n", - "99991 18 7 29 \n", - "99995 11 11 59 \n", + "99979 7 21 46 \n", + "99983 3 8 0 \n", + "99987 10 15 33 \n", + "99991 18 8 52 \n", + "99995 11 13 22 \n", "\n", " timestamp_second distance \n", "transaction_id \n", - "3 41 439.890 \n", - "7 3 400.610 \n", - "11 35 211.655 \n", - "15 5 191.365 \n", - "19 11 55.885 \n", + "3 54 439.890 \n", + "7 16 400.610 \n", + "11 48 211.655 \n", + "15 18 191.365 \n", + "19 24 55.885 \n", "... ... ... \n", - "99979 11 151.050 \n", - "99983 56 57.945 \n", - "99987 44 103.540 \n", - "99991 21 177.790 \n", - "99995 29 102.130 \n", + "99979 24 151.050 \n", + "99983 9 57.945 \n", + "99987 57 103.540 \n", + "99991 34 177.790 \n", + "99995 42 102.130 \n", "\n", "[24999 rows x 12 columns]" ] }, - "execution_count": 28, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1685,7 +1692,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 19, "id": "e1d377a5-cf7e-4564-8e14-10bfbaca4da2", "metadata": {}, "outputs": [ @@ -1706,7 +1713,7 @@ " 'distance']" ] }, - "execution_count": 30, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1718,7 +1725,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 20, "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", "metadata": {}, "outputs": [], @@ -1747,7 +1754,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 21, "id": "eb69d9fa-22a9-4b9f-9443-d00d9190ad55", "metadata": {}, "outputs": [ @@ -1796,9 +1803,9 @@ " 2024\n", " 4\n", " 5\n", - " 4\n", - " 48\n", - " 41\n", + " 6\n", + " 11\n", + " 54\n", " 439.890\n", " 0\n", " \n", @@ -1811,9 
+1818,9 @@ " 2024\n", " 2\n", " 6\n", - " 7\n", - " 21\n", - " 3\n", + " 8\n", + " 44\n", + " 16\n", " 400.610\n", " 0\n", " \n", @@ -1826,9 +1833,9 @@ " 2024\n", " 3\n", " 15\n", - " 0\n", - " 52\n", - " 35\n", + " 2\n", + " 15\n", + " 48\n", " 211.655\n", " 0\n", " \n", @@ -1841,9 +1848,9 @@ " 2024\n", " 1\n", " 17\n", - " 7\n", - " 45\n", - " 5\n", + " 9\n", + " 8\n", + " 18\n", " 191.365\n", " 0\n", " \n", @@ -1856,9 +1863,9 @@ " 2024\n", " 1\n", " 3\n", - " 4\n", - " 27\n", - " 11\n", + " 5\n", + " 50\n", + " 24\n", " 55.885\n", " 0\n", " \n", @@ -1886,9 +1893,9 @@ " 2024\n", " 3\n", " 7\n", - " 20\n", - " 23\n", - " 11\n", + " 21\n", + " 46\n", + " 24\n", " 151.050\n", " 18\n", " \n", @@ -1901,9 +1908,9 @@ " 2024\n", " 4\n", " 3\n", - " 6\n", - " 36\n", - " 56\n", + " 8\n", + " 0\n", + " 9\n", " 57.945\n", " 18\n", " \n", @@ -1916,9 +1923,9 @@ " 2024\n", " 2\n", " 10\n", - " 14\n", - " 10\n", - " 44\n", + " 15\n", + " 33\n", + " 57\n", " 103.540\n", " 18\n", " \n", @@ -1931,9 +1938,9 @@ " 2024\n", " 4\n", " 18\n", - " 7\n", - " 29\n", - " 21\n", + " 8\n", + " 52\n", + " 34\n", " 177.790\n", " 18\n", " \n", @@ -1946,9 +1953,9 @@ " 2024\n", " 2\n", " 11\n", - " 11\n", - " 59\n", - " 29\n", + " 13\n", + " 22\n", + " 42\n", " 102.130\n", " 18\n", " \n", @@ -1972,30 +1979,30 @@ "24998 204.26 4262047194499006 4017367486513464 204.26 \n", "\n", " timestamp_year timestamp_month timestamp_day timestamp_hour \\\n", - "0 2024 4 5 4 \n", - "1 2024 2 6 7 \n", - "2 2024 3 15 0 \n", - "3 2024 1 17 7 \n", - "4 2024 1 3 4 \n", + "0 2024 4 5 6 \n", + "1 2024 2 6 8 \n", + "2 2024 3 15 2 \n", + "3 2024 1 17 9 \n", + "4 2024 1 3 5 \n", "... ... ... ... ... 
\n", - "24994 2024 3 7 20 \n", - "24995 2024 4 3 6 \n", - "24996 2024 2 10 14 \n", - "24997 2024 4 18 7 \n", - "24998 2024 2 11 11 \n", + "24994 2024 3 7 21 \n", + "24995 2024 4 3 8 \n", + "24996 2024 2 10 15 \n", + "24997 2024 4 18 8 \n", + "24998 2024 2 11 13 \n", "\n", " timestamp_minute timestamp_second distance \\\n", - "0 48 41 439.890 \n", - "1 21 3 400.610 \n", - "2 52 35 211.655 \n", - "3 45 5 191.365 \n", - "4 27 11 55.885 \n", + "0 11 54 439.890 \n", + "1 44 16 400.610 \n", + "2 15 48 211.655 \n", + "3 8 18 191.365 \n", + "4 50 24 55.885 \n", "... ... ... ... \n", - "24994 23 11 151.050 \n", - "24995 36 56 57.945 \n", - "24996 10 44 103.540 \n", - "24997 29 21 177.790 \n", - "24998 59 29 102.130 \n", + "24994 46 24 151.050 \n", + "24995 0 9 57.945 \n", + "24996 33 57 103.540 \n", + "24997 52 34 177.790 \n", + "24998 22 42 102.130 \n", "\n", " transaction_category_mapped \n", "0 0 \n", @@ -2013,7 +2020,7 @@ "[24999 rows x 12 columns]" ] }, - "execution_count": 32, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2028,7 +2035,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 22, "id": "cb156ebe-9846-4ff3-a388-92362df7c741", "metadata": {}, "outputs": [ @@ -2042,13 +2049,13 @@ " 'timestamp_year': 2024,\n", " 'timestamp_month': 3,\n", " 'timestamp_day': 20,\n", - " 'timestamp_hour': 4,\n", - " 'timestamp_minute': 33,\n", - " 'timestamp_second': 58,\n", + " 'timestamp_hour': 5,\n", + " 'timestamp_minute': 57,\n", + " 'timestamp_second': 11,\n", " 'distance': 5.225}]" ] }, - "execution_count": 36, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2061,7 +2068,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 23, "id": "7737865e-21a1-4bfe-b24a-29925145280f", "metadata": {}, "outputs": [ @@ -2075,13 +2082,13 @@ " 2024,\n", " 3,\n", " 20,\n", - " 4,\n", - " 33,\n", - " 58,\n", + " 5,\n", + " 57,\n", + " 11,\n", " 5.225]]" ] }, - "execution_count": 37, 
+ "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -2131,7 +2138,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 24, "id": "1cbb00b5-46bf-4a20-aad9-a03716ab97ae", "metadata": {}, "outputs": [ @@ -2181,9 +2188,9 @@ " 2024\n", " 4\n", " 5\n", - " 4\n", - " 48\n", - " 41\n", + " 6\n", + " 11\n", + " 54\n", " 439.890\n", " \n", " \n", @@ -2196,9 +2203,9 @@ " 2024\n", " 2\n", " 6\n", - " 7\n", - " 21\n", - " 3\n", + " 8\n", + " 44\n", + " 16\n", " 400.610\n", " \n", " \n", @@ -2211,9 +2218,9 @@ " 2024\n", " 3\n", " 15\n", - " 0\n", - " 52\n", - " 35\n", + " 2\n", + " 15\n", + " 48\n", " 211.655\n", " \n", " \n", @@ -2226,9 +2233,9 @@ " 2024\n", " 1\n", " 17\n", - " 7\n", - " 45\n", - " 5\n", + " 9\n", + " 8\n", + " 18\n", " 191.365\n", " \n", " \n", @@ -2241,9 +2248,9 @@ " 2024\n", " 1\n", " 3\n", - " 4\n", - " 27\n", - " 11\n", + " 5\n", + " 50\n", + " 24\n", " 55.885\n", " \n", " \n", @@ -2271,9 +2278,9 @@ " 2024\n", " 3\n", " 7\n", - " 20\n", - " 23\n", - " 11\n", + " 21\n", + " 46\n", + " 24\n", " 151.050\n", " \n", " \n", @@ -2286,9 +2293,9 @@ " 2024\n", " 4\n", " 3\n", - " 6\n", - " 36\n", - " 56\n", + " 8\n", + " 0\n", + " 9\n", " 57.945\n", " \n", " \n", @@ -2301,9 +2308,9 @@ " 2024\n", " 2\n", " 10\n", - " 14\n", - " 10\n", - " 44\n", + " 15\n", + " 33\n", + " 57\n", " 103.540\n", " \n", " \n", @@ -2316,9 +2323,9 @@ " 2024\n", " 4\n", " 18\n", - " 7\n", - " 29\n", - " 21\n", + " 8\n", + " 52\n", + " 34\n", " 177.790\n", " \n", " \n", @@ -2331,9 +2338,9 @@ " 2024\n", " 2\n", " 11\n", - " 11\n", - " 59\n", - " 29\n", + " 13\n", + " 22\n", + " 42\n", " 102.130\n", " \n", " \n", @@ -2369,17 +2376,17 @@ "24998 4017367486513464 204.26 2024 2 \n", "\n", " timestamp_day timestamp_hour timestamp_minute timestamp_second \\\n", - "0 5 4 48 41 \n", - "1 6 7 21 3 \n", - "2 15 0 52 35 \n", - "3 17 7 45 5 \n", - "4 3 4 27 11 \n", + "0 5 6 11 54 \n", + "1 6 8 44 16 \n", + "2 15 2 15 48 \n", + "3 17 9 8 18 \n", + "4 3 5 50 
24 \n", "... ... ... ... ... \n", - "24994 7 20 23 11 \n", - "24995 3 6 36 56 \n", - "24996 10 14 10 44 \n", - "24997 18 7 29 21 \n", - "24998 11 11 59 29 \n", + "24994 7 21 46 24 \n", + "24995 3 8 0 9 \n", + "24996 10 15 33 57 \n", + "24997 18 8 52 34 \n", + "24998 11 13 22 42 \n", "\n", " distance \n", "0 439.890 \n", @@ -2397,7 +2404,7 @@ "[24999 rows x 12 columns]" ] }, - "execution_count": 38, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2420,7 +2427,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 25, "id": "47512de3-60ac-49c7-ace8-031959527e86", "metadata": {}, "outputs": [], @@ -2441,7 +2448,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 26, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -2461,7 +2468,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 27, "id": "e1ca2543", "metadata": {}, "outputs": [], @@ -2487,7 +2494,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 28, "id": "a41b6a7d", "metadata": {}, "outputs": [], @@ -2505,7 +2512,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 29, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -2528,7 +2535,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 30, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -2553,7 +2560,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 31, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -2581,7 +2588,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 32, "id": "c24e06fc", "metadata": { "scrolled": true @@ -2591,141 +2598,141 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-12-03-29-991\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-13-24-31-606\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ - "2024-02-11 12:03:30 Starting - Starting the training job...\n", - "2024-02-11 12:03:45 Starting - Preparing the instances for training......\n", - "2024-02-11 12:04:42 Downloading - Downloading input data...\n", - "2024-02-11 12:05:23 Downloading - Downloading the training image......\n", - "2024-02-11 12:06:08 Training - Training image download completed. Training in progress...\u001b[34m[2024-02-11 12:06:25.555 ip-10-0-155-6.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-11 13:24:31 Starting - Starting the training job...\n", + "2024-02-11 13:24:45 Starting - Preparing the instances for training......\n", + "2024-02-11 13:25:57 Downloading - Downloading input data...\n", + "2024-02-11 13:26:31 Downloading - Downloading the training image......\n", + "2024-02-11 13:27:27 Training - Training image download completed. 
Training in progress....\n", + "2024-02-11 13:28:02 Uploading - Uploading generated training model\u001b[34m[2024-02-11 13:27:38.372 ip-10-0-187-204.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Train matrix has 17499 rows and 11 columns\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Validation matrix has 5000 rows\u001b[0m\n", - "\u001b[34m[2024-02-11 12:06:25.675 ip-10-0-155-6.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-11 12:06:25.676 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-11 12:06:25.676 ip-10-0-155-6.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-11 12:06:25.677 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-11 12:06:25.677 ip-10-0-155-6.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-11:12:06:25:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.55803#011validation-merror:0.57260\u001b[0m\n", - "\u001b[34m[2024-02-11 12:06:25.895 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-11 12:06:25.898 ip-10-0-155-6.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.54860#011validation-merror:0.57440\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.53409#011validation-merror:0.55980\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.53197#011validation-merror:0.55740\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.52729#011validation-merror:0.55340\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.51700#011validation-merror:0.54420\u001b[0m\n", - "\u001b[34m[6]#011train-merror:0.51254#011validation-merror:0.54000\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.50969#011validation-merror:0.53540\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.50351#011validation-merror:0.53400\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.49934#011validation-merror:0.53040\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.49380#011validation-merror:0.52340\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.48403#011validation-merror:0.51520\u001b[0m\n", - 
"\u001b[34m[12]#011train-merror:0.47951#011validation-merror:0.50980\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.47700#011validation-merror:0.50820\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.47111#011validation-merror:0.50620\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.46694#011validation-merror:0.50000\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.46020#011validation-merror:0.49200\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.45723#011validation-merror:0.48980\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.44991#011validation-merror:0.48140\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.44317#011validation-merror:0.47780\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.43831#011validation-merror:0.47340\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.43551#011validation-merror:0.46980\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.42711#011validation-merror:0.45920\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.42134#011validation-merror:0.45440\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.41671#011validation-merror:0.45100\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.41425#011validation-merror:0.44940\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.41157#011validation-merror:0.44800\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.40688#011validation-merror:0.44400\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.40151#011validation-merror:0.43880\u001b[0m\n", - "\u001b[34m[29]#011train-merror:0.39785#011validation-merror:0.43400\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.39339#011validation-merror:0.43000\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.38625#011validation-merror:0.42440\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.38214#011validation-merror:0.42200\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.37802#011validation-merror:0.41940\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.37254#011validation-merror:0.41460\u001b[0m\n", - 
"\u001b[34m[35]#011train-merror:0.36773#011validation-merror:0.41160\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.36499#011validation-merror:0.40940\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.35808#011validation-merror:0.40100\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.35574#011validation-merror:0.39940\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.35202#011validation-merror:0.39540\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.35031#011validation-merror:0.39400\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.34813#011validation-merror:0.39160\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.34339#011validation-merror:0.38680\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.33756#011validation-merror:0.38040\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.33545#011validation-merror:0.37880\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.33071#011validation-merror:0.37360\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.32808#011validation-merror:0.36900\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.32608#011validation-merror:0.36620\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.31939#011validation-merror:0.36220\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.31556#011validation-merror:0.35900\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.31396#011validation-merror:0.35520\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.31265#011validation-merror:0.35400\u001b[0m\n", - "\u001b[34m[52]#011train-merror:0.31030#011validation-merror:0.35160\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.30447#011validation-merror:0.34620\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.30236#011validation-merror:0.34480\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.29933#011validation-merror:0.34280\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.29693#011validation-merror:0.34120\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.29464#011validation-merror:0.34020\u001b[0m\n", - 
"\u001b[34m[58]#011train-merror:0.28904#011validation-merror:0.33420\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.28664#011validation-merror:0.33260\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.28579#011validation-merror:0.33200\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.28082#011validation-merror:0.32700\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.27813#011validation-merror:0.32440\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.27727#011validation-merror:0.32360\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.27270#011validation-merror:0.32140\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.26813#011validation-merror:0.31800\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.26464#011validation-merror:0.31440\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.26333#011validation-merror:0.31320\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.26076#011validation-merror:0.31100\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.25916#011validation-merror:0.30900\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.25613#011validation-merror:0.30560\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.25510#011validation-merror:0.30640\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.25373#011validation-merror:0.30480\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.25190#011validation-merror:0.30220\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.24950#011validation-merror:0.29860\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.24504#011validation-merror:0.29260\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.24304#011validation-merror:0.29080\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.24099#011validation-merror:0.29060\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.23904#011validation-merror:0.28880\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.23681#011validation-merror:0.28720\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.23584#011validation-merror:0.28460\u001b[0m\n", - 
"\u001b[34m[81]#011train-merror:0.23327#011validation-merror:0.28280\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.23161#011validation-merror:0.28120\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.22864#011validation-merror:0.27860\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.22470#011validation-merror:0.27680\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.22241#011validation-merror:0.27580\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.22087#011validation-merror:0.27440\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.21818#011validation-merror:0.27160\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.21744#011validation-merror:0.27000\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.21584#011validation-merror:0.26800\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.21476#011validation-merror:0.26640\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.21241#011validation-merror:0.26600\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.21161#011validation-merror:0.26440\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.21018#011validation-merror:0.26280\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.20927#011validation-merror:0.26160\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.20641#011validation-merror:0.25820\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.20544#011validation-merror:0.25840\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.20390#011validation-merror:0.25700\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.20287#011validation-merror:0.25520\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.20207#011validation-merror:0.25340\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + 
"\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Train matrix has 17499 rows and 11 columns\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Validation matrix has 5000 rows\u001b[0m\n", + "\u001b[34m[2024-02-11 13:27:38.503 ip-10-0-187-204.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-11 13:27:38.504 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-11 13:27:38.505 ip-10-0-187-204.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-11 13:27:38.505 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-11 13:27:38.506 ip-10-0-187-204.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-11:13:27:38:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.55820#011validation-merror:0.58340\u001b[0m\n", + "\u001b[34m[2024-02-11 13:27:38.726 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-11 13:27:38.729 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.54935#011validation-merror:0.57620\u001b[0m\n", + 
"\u001b[34m[2]#011train-merror:0.53712#011validation-merror:0.56500\u001b[0m\n", + "\u001b[34m[3]#011train-merror:0.52723#011validation-merror:0.55580\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.52312#011validation-merror:0.54980\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.51586#011validation-merror:0.54260\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.50992#011validation-merror:0.53440\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.50723#011validation-merror:0.53080\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.49729#011validation-merror:0.52640\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.49163#011validation-merror:0.52160\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.48740#011validation-merror:0.51680\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.48146#011validation-merror:0.51300\u001b[0m\n", + "\u001b[34m[12]#011train-merror:0.47597#011validation-merror:0.50720\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.47151#011validation-merror:0.50540\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.46208#011validation-merror:0.49700\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.45917#011validation-merror:0.49240\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.45363#011validation-merror:0.49120\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.44460#011validation-merror:0.48360\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.44134#011validation-merror:0.47800\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.43745#011validation-merror:0.47400\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.43420#011validation-merror:0.47180\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.42848#011validation-merror:0.46740\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.42100#011validation-merror:0.46000\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.41385#011validation-merror:0.45320\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.40894#011validation-merror:0.45100\u001b[0m\n", + 
"\u001b[34m[25]#011train-merror:0.40339#011validation-merror:0.44400\u001b[0m\n", + "\u001b[34m[26]#011train-merror:0.39734#011validation-merror:0.43880\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.39105#011validation-merror:0.43260\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.38871#011validation-merror:0.42860\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.38511#011validation-merror:0.42520\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.38356#011validation-merror:0.42420\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.37608#011validation-merror:0.41720\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.36973#011validation-merror:0.41240\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.36471#011validation-merror:0.40640\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.36271#011validation-merror:0.40500\u001b[0m\n", + "\u001b[34m[35]#011train-merror:0.35882#011validation-merror:0.40200\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.35791#011validation-merror:0.40060\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.35253#011validation-merror:0.39520\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.34796#011validation-merror:0.39300\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.34722#011validation-merror:0.39180\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.34482#011validation-merror:0.39040\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.34196#011validation-merror:0.38640\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.33910#011validation-merror:0.38160\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.33465#011validation-merror:0.37620\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.33179#011validation-merror:0.37440\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.32893#011validation-merror:0.37380\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.32533#011validation-merror:0.37220\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.32333#011validation-merror:0.37080\u001b[0m\n", + 
"\u001b[34m[48]#011train-merror:0.31842#011validation-merror:0.36740\u001b[0m\n", + "\u001b[34m[49]#011train-merror:0.31476#011validation-merror:0.36140\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.31145#011validation-merror:0.35800\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.30973#011validation-merror:0.35640\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.30642#011validation-merror:0.35560\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.30476#011validation-merror:0.35540\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.30299#011validation-merror:0.35300\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.30133#011validation-merror:0.35160\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.29945#011validation-merror:0.34920\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.29545#011validation-merror:0.34740\u001b[0m\n", + "\u001b[34m[58]#011train-merror:0.29185#011validation-merror:0.34100\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.29013#011validation-merror:0.33840\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.28670#011validation-merror:0.33640\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.28047#011validation-merror:0.33180\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.27927#011validation-merror:0.33040\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.27584#011validation-merror:0.32720\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.27264#011validation-merror:0.32340\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.26773#011validation-merror:0.31760\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.26459#011validation-merror:0.31400\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.26407#011validation-merror:0.31340\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.26219#011validation-merror:0.31160\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.25899#011validation-merror:0.30900\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.25727#011validation-merror:0.30800\u001b[0m\n", + 
"\u001b[34m[71]#011train-merror:0.25533#011validation-merror:0.30580\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.25350#011validation-merror:0.30440\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.25310#011validation-merror:0.30260\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.25001#011validation-merror:0.29880\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.24653#011validation-merror:0.29380\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.24350#011validation-merror:0.29340\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.24104#011validation-merror:0.29100\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.23824#011validation-merror:0.28880\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.23681#011validation-merror:0.28700\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.23367#011validation-merror:0.28320\u001b[0m\n", + "\u001b[34m[81]#011train-merror:0.23219#011validation-merror:0.28320\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.23104#011validation-merror:0.28120\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.22939#011validation-merror:0.27940\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.22830#011validation-merror:0.27800\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.22738#011validation-merror:0.27700\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.22647#011validation-merror:0.27640\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.22618#011validation-merror:0.27560\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.22464#011validation-merror:0.27580\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.22310#011validation-merror:0.27140\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.22093#011validation-merror:0.27080\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.22007#011validation-merror:0.26900\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.21796#011validation-merror:0.26780\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.21681#011validation-merror:0.26580\u001b[0m\n", + 
"\u001b[34m[94]#011train-merror:0.21590#011validation-merror:0.26480\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.21350#011validation-merror:0.26280\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.21001#011validation-merror:0.26200\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.20744#011validation-merror:0.25940\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.20601#011validation-merror:0.25780\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.20481#011validation-merror:0.25620\u001b[0m\n", "\n", - "2024-02-11 12:06:53 Uploading - Uploading generated training model\n", - "2024-02-11 12:07:05 Completed - Training job completed\n", + "2024-02-11 13:28:18 Completed - Training job completed\n", "Training seconds: 142\n", "Billable seconds: 142\n" ] @@ -2747,17 +2754,17 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 33, "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-12-03-29-991/output/model.tar.gz'" + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-13-24-31-606/output/model.tar.gz'" ] }, - "execution_count": 47, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2768,7 +2775,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": [], @@ -2776,40 +2783,203 @@ "serving_function = project.get_function(\"serving\")" ] }, + { + "cell_type": "code", + "execution_count": 60, + "id": "911457fa-812d-4991-a31c-4dfcb1593d3e", + "metadata": {}, + "outputs": [], + "source": [ + "serving_function_v2 = project.set_function(\n", + " func=\"src/functions/serving.py\",\n", + " name=\"serving-v2\",\n", + " kind=\"serving\",\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, - "id": 
"bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", + "id": "2881c17d-dd84-43d7-acc7-83e40c8110d3", "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_start->\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "xgboost-model\n", + "\n", + "xgboost-model\n", + "\n", + "\n", + "\n", + "->xgboost-model\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "graph = serving_function.set_topology(\n", + "graph = serving_function_v2.set_topology(\n", " \"router\",\n", " mlrun.serving.routers.EnrichmentModelRouter(\n", - " feature_vector_uri=\"store://feature-vectors/sagemaker-v8-admin/transactions-vector:latest\",\n", + " feature_vector_uri=transactions_fv.uri,\n", " impute_policy={\"*\": \"$mean\"}),\n", ")\n", - "\n", - "#graph.to(handler=\"postprocess\", name=\"postprocess\").respond()\n", - "# # add the 3 trained models to the Ensemble\n", - "# for model in project.list_models('', tag='latest'):\n", - "# name = model.spec.db_key\n", - "# serving_fn.add_model(name, class_name=\"ClassifierModel\", model_path=model.uri)\n", - "\n", - "serving_function.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=model_path)\n", + "serving_function_v2.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=xgb.model_data)\n", "\n", "# Plot the ensemble configuration\n", - "serving_function.spec.graph.plot()" + "serving_function_v2.spec.graph.plot()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, + "id": "0ab0bcd2-5c70-4f48-bff9-d060f027e8e5", + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 14:19:33,242 [info] model xgboost-model was loaded\n", + "> 2024-02-11 14:19:33,243 [info] Loaded ['xgboost-model']\n" + ] + } + ], + "source": [ + "server = serving_function_v2.to_mock_server()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "dd57cfcd-5878-4775-83ee-422dc2261ce8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'inputs': [[10.45, 4365439229004487, 4895135853273971, 10.45, 2024, 3, 20, 5, 57, 11, 5.225]]}\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': '4674ea5dbf944924b1b17c6fba1fb6e9',\n", + " 'model_name': 'xgboost-model',\n", + " 'outputs': [[0.0007725695031695068,\n", + " 0.18554285168647766,\n", + " 0.0005761922220699489,\n", + " 0.421852707862854,\n", + " 0.007665876764804125,\n", + " 0.012660829350352287,\n", + " 0.22086532413959503,\n", + " 0.026935890316963196,\n", + " 0.0004489392740651965,\n", + " 0.0073969378136098385,\n", + " 0.08978493511676788,\n", + " 0.0005328097031451762,\n", + " 0.020548472180962563,\n", + " 0.0011180019937455654,\n", + " 0.0005254872958175838,\n", + " 0.00045757496263831854,\n", + " 0.0003292290784884244,\n", + " 0.0009559299796819687,\n", + " 0.0010294184321537614]]}" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response = server.test(body={'inputs':[[24995]]})\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": 65, "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 14:20:02,832 [info] Starting remote function deploy\n", + "2024-02-11 14:20:03 (info) Deploying function\n", + "2024-02-11 14:20:03 (info) Building\n", + "2024-02-11 14:20:03 (info) Staging files and preparing base images\n", + "2024-02-11 14:20:03 
(info) Building processor image\n", + "2024-02-11 14:21:09 (info) Build complete\n", + "2024-02-11 14:21:17 (info) Function deploy complete\n", + "> 2024-02-11 14:21:24,362 [info] Successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-v3-admin-serving-v2.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" + ] + }, + { + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-v3-admin-serving-v2'})" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "project.deploy_function(\"serving\")" + "project.deploy_function(\"serving-v2\")" ] }, { @@ -2819,17 +2989,76 @@ "metadata": {}, "outputs": [], "source": [ - "samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" + "#samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", + "execution_count": 66, + "id": "ac19dc03-01e2-4e29-ba75-a34804833d5c", "metadata": {}, "outputs": [], "source": [ - "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" + "serving_function_v2 = project.get_function(\"serving-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 14:22:55,332 [info] Invoking function: {'method': 'POST', 'path': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//v2/models/xgboost-model/predict'}\n" + ] + } + ], + "source": [ + "response = 
serving_function_v2.invoke(path='/v2/models/xgboost-model/predict', body={\"inputs\": [[24995]]})" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "57eeaddc-654a-41d2-bb51-4a9a787a3311", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '56b180c3-2e57-4bae-89cb-92b2c31f5c9b',\n", + " 'model_name': 'xgboost-model',\n", + " 'outputs': [[0.0007725695031695068,\n", + " 0.18554285168647766,\n", + " 0.0005761922220699489,\n", + " 0.421852707862854,\n", + " 0.007665876764804125,\n", + " 0.012660829350352287,\n", + " 0.22086532413959503,\n", + " 0.026935890316963196,\n", + " 0.0004489392740651965,\n", + " 0.0073969378136098385,\n", + " 0.08978493511676788,\n", + " 0.0005328097031451762,\n", + " 0.020548472180962563,\n", + " 0.0011180019937455654,\n", + " 0.0005254872958175838,\n", + " 0.00045757496263831854,\n", + " 0.0003292290784884244,\n", + " 0.0009559299796819687,\n", + " 0.0010294184321537614]]}" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response" ] }, { @@ -2845,6 +3074,14 @@ { "cell_type": "code", "execution_count": null, + "id": "35ff008b-f4e4-491b-b1e8-3b0a652c35fc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 77, "id": "2e863ea7-5804-4637-b677-390c305cabfe", "metadata": {}, "outputs": [], @@ -2862,7 +3099,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "id": "ca4f7e49", "metadata": {}, "outputs": [], @@ -2880,10 +3117,257 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 14:25:34,435 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '4e422c0c56c049469ee777c86b3bde01', 'db': 
'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", + "> 2024-02-11 14:25:34,785 [info] Job is running in the background, pod: evaluate-evaluate-ppcb5\n", + "[14:25:39] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", + "configuration generated by an older version of XGBoost, please export the model by calling\n", + "`Booster.save_model` from that version first, then load it back in current version. See:\n", + "\n", + " https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html\n", + "\n", + "for more details about differences between saving model and serializing.\n", + "\n", + "> 2024-02-11 14:25:39,434 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 4e422c0c56c049469ee777c86b3bde01 -p sagemaker-v3-admin', 'logs_cmd': 'mlrun logs 4e422c0c56c049469ee777c86b3bde01 -p sagemaker-v3-admin'}\n", + "> 2024-02-11 14:25:39,434 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-v3-admin/jobs/monitor/4e422c0c56c049469ee777c86b3bde01/overview'}\n", + "> 2024-02-11 14:25:39,435 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sagemaker-v3-admin0Feb 11 14:25:37completedevaluate-evaluate
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc26
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-ppcb5
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-13-24-31-606/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category_mapped
factorize_key={'Uncategorized': 0, 'Entertainment': 1, 'Education': 2, 'Shopping': 3, 'Personal Care': 4, 'Health and Fitness': 5, 'Food and Dining': 6, 'Gifts and Donations': 7, 'Investments': 8, 'Bills and Utilities': 9, 'Auto and Transport': 10, 'Travel': 11, 'Fees and Charges': 12, 'Business Services': 13, 'Personal Services': 14, 'Taxes': 15, 'Gambling': 16, 'Home': 17, 'Pension and insurances': 18}
classification_report
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 14:25:45,077 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + ] + } + ], "source": [ "evaluate_run = evaluate_function.run(\n", " handler=\"evaluate\",\n", @@ -2891,7 +3375,7 @@ " \"model_path\": xgb.model_data,\n", " \"model_name\": \"xgboost-model\",\n", " \"test_set\": s3_data,\n", - " \"label_column\": \"transaction_category\",\n", + " \"label_column\": \"transaction_category_mapped\",\n", " \"factorize_key\": factorize_key,\n", " },\n", " returns=[\"classification_report: dataset\"])" @@ -2907,10 +3391,227 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precisionrecallf1-scoresupport
Uncategorized0.8333330.6250000.7142868.0000
Entertainment0.6809950.8314920.748756362.0000
Education0.9230770.8571430.88888914.0000
Shopping0.7102000.8662790.780513860.0000
Personal Care0.8800000.9565220.91666723.0000
Health and Fitness0.8750000.5691060.689655123.0000
Food and Dining0.8451610.6036870.704301217.0000
Gifts and Donations0.7380950.4558820.56363668.0000
Investments1.0000000.9090910.95238122.0000
Bills and Utilities0.9354840.6304350.75324792.0000
Auto and Transport0.7583730.6673680.709966475.0000
Travel0.8333330.6944440.75757636.0000
Fees and Charges1.0000000.8275860.90566029.0000
Business Services0.9696970.6808510.80000047.0000
Personal Services0.8947370.7083330.79069824.0000
Taxes0.9000000.6923080.78260913.0000
Gambling1.0000000.7500000.8571438.0000
Home0.8863640.7500000.81250052.0000
Pension and insurances0.8636360.7037040.77551027.0000
accuracy0.7532000.7532000.7532000.7532
macro avg0.8698680.7252230.7844212500.0000
weighted avg0.7687150.7532000.7501362500.0000
\n", + "
" + ], + "text/plain": [ + " precision recall f1-score support\n", + "Uncategorized 0.833333 0.625000 0.714286 8.0000\n", + "Entertainment 0.680995 0.831492 0.748756 362.0000\n", + "Education 0.923077 0.857143 0.888889 14.0000\n", + "Shopping 0.710200 0.866279 0.780513 860.0000\n", + "Personal Care 0.880000 0.956522 0.916667 23.0000\n", + "Health and Fitness 0.875000 0.569106 0.689655 123.0000\n", + "Food and Dining 0.845161 0.603687 0.704301 217.0000\n", + "Gifts and Donations 0.738095 0.455882 0.563636 68.0000\n", + "Investments 1.000000 0.909091 0.952381 22.0000\n", + "Bills and Utilities 0.935484 0.630435 0.753247 92.0000\n", + "Auto and Transport 0.758373 0.667368 0.709966 475.0000\n", + "Travel 0.833333 0.694444 0.757576 36.0000\n", + "Fees and Charges 1.000000 0.827586 0.905660 29.0000\n", + "Business Services 0.969697 0.680851 0.800000 47.0000\n", + "Personal Services 0.894737 0.708333 0.790698 24.0000\n", + "Taxes 0.900000 0.692308 0.782609 13.0000\n", + "Gambling 1.000000 0.750000 0.857143 8.0000\n", + "Home 0.886364 0.750000 0.812500 52.0000\n", + "Pension and insurances 0.863636 0.703704 0.775510 27.0000\n", + "accuracy 0.753200 0.753200 0.753200 0.7532\n", + "macro avg 0.869868 0.725223 0.784421 2500.0000\n", + "weighted avg 0.768715 0.753200 0.750136 2500.0000" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "evaluate_run.artifact(\"classification_report\").as_df()" ] diff --git a/src/functions/evaluate.py b/src/functions/evaluate.py index 34a7359..c905379 100644 --- a/src/functions/evaluate.py +++ b/src/functions/evaluate.py @@ -67,7 +67,7 @@ def evaluate( # generate classification report: report = classification_report( - y_true=test_set["transaction_category"].to_list(), + y_true=test_set["transaction_category_mapped"].to_list(), y_pred=predictions, target_names=factorize_key, output_dict=True, From 2bf17fc663da8935c067ff421fa387bc936a642a Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: 
Sun, 11 Feb 2024 16:42:13 +0000 Subject: [PATCH 10/16] updating entity id from id to category --- financial_payment_classification_v3.ipynb | 1953 ++++++++++----------- 1 file changed, 883 insertions(+), 1070 deletions(-) diff --git a/financial_payment_classification_v3.ipynb b/financial_payment_classification_v3.ipynb index b514640..dd598ad 100644 --- a/financial_payment_classification_v3.ipynb +++ b/financial_payment_classification_v3.ipynb @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 3, "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", "metadata": { "editable": true, @@ -108,7 +108,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-11 14:25:22,851 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" + "> 2024-02-11 16:21:51,304 [info] Identified pre-initialized git repo, using it: {'url': 'git://github.com/aviaIguazio/demo-sagemaker.git#refs/heads/development'}\n", + "> 2024-02-11 16:22:06,708 [info] Created and saved project: {'name': 'sagemaker-v3-admin', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}\n", + "> 2024-02-11 16:22:07,592 [info] Project created successfully: {'project_name': 'sagemaker-v3', 'stored_in_db': True}\n" ] } ], @@ -133,19 +135,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "42c5d6d0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", - "sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml\n" - ] - } - ], + "outputs": [], "source": [ "import boto3\n", "import io\n", @@ -160,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", "metadata": {}, "outputs": [], @@ -170,7 +163,7 @@ }, { "cell_type": "code", - 
"execution_count": 6, + "execution_count": 7, "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", "metadata": {}, "outputs": [], @@ -190,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c0e4db17", "metadata": {}, "outputs": [], @@ -214,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "43946b9f", "metadata": {}, "outputs": [], @@ -254,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "5ff0d280", "metadata": {}, "outputs": [], @@ -277,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "a477abd7", "metadata": {}, "outputs": [], @@ -313,171 +306,13 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "8c15f00d-8f89-41ec-aa22-f23fc394d1b4", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamounttimestamp
0Uncategorized45185519044999194333582346477646833.262024-01-08 08:38:24.016496
1Uncategorized45185519044999194642413144038776596.632023-12-12 06:34:14.016496
2Uncategorized42745440229395224952665515556751176.762023-12-22 07:10:14.016496
3Uncategorized45185519044999194457298962882528879.782024-02-07 04:55:01.016496
4Uncategorized46018532461252204578126462896710742.252024-02-02 04:30:58.016496
..................
99992Pension and insurances44050083552203244583355906735225205.432024-02-18 01:04:35.016496
99993Pension and insurances43004167445113354949240916846171151.492024-01-22 08:11:00.016496
99994Pension and insurances44050083552203244996896020767264188.282024-01-06 08:31:52.016496
99995Pension and insurances42620471944990064017367486513464204.262023-12-15 12:05:49.016496
99996Pension and insurances46275166741447044250420705087194207.922024-02-11 13:22:42.016496
\n", - "

99997 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "0 Uncategorized 4518551904499919 4333582346477646 833.26 \n", - "1 Uncategorized 4518551904499919 4642413144038776 596.63 \n", - "2 Uncategorized 4274544022939522 4952665515556751 176.76 \n", - "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "4 Uncategorized 4601853246125220 4578126462896710 742.25 \n", - "... ... ... ... ... \n", - "99992 Pension and insurances 4405008355220324 4583355906735225 205.43 \n", - "99993 Pension and insurances 4300416744511335 4949240916846171 151.49 \n", - "99994 Pension and insurances 4405008355220324 4996896020767264 188.28 \n", - "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "99996 Pension and insurances 4627516674144704 4250420705087194 207.92 \n", - "\n", - " timestamp \n", - "0 2024-01-08 08:38:24.016496 \n", - "1 2023-12-12 06:34:14.016496 \n", - "2 2023-12-22 07:10:14.016496 \n", - "3 2024-02-07 04:55:01.016496 \n", - "4 2024-02-02 04:30:58.016496 \n", - "... ... 
\n", - "99992 2024-02-18 01:04:35.016496 \n", - "99993 2024-01-22 08:11:00.016496 \n", - "99994 2024-01-06 08:31:52.016496 \n", - "99995 2023-12-15 12:05:49.016496 \n", - "99996 2024-02-11 13:22:42.016496 \n", - "\n", - "[99997 rows x 5 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from utils import update_timestamps\n", - "data=update_timestamps(data)\n", - "data" + "data=update_timestamps(data)" ] }, { @@ -490,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", "metadata": {}, "outputs": [], @@ -520,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", "metadata": {}, "outputs": [], @@ -536,28 +371,17 @@ "\n", "def calculate_category_distance(event): \n", " column_name ='transaction_category_mapped'\n", - " #print(type(event))\n", - " #print(event)\n", " event = pop_and_move_to_start(event,column_name)\n", - " #print(event)\n", " category = event[column_name]\n", - " #event[category+'distance'] = abs(event['amount']-event[category+'_avg_1d'])\n", - " event['distance'] = abs(event['amount']/2)\n", + " event['distance'] = abs(event['amount']-event['amount_avg_1d'])\n", + " #event['distance'] = abs(event['amount']/2)\n", " \n", " return event" ] }, { "cell_type": "code", - "execution_count": null, - "id": "7d236f74-c850-4ff0-a8ee-e4a89966ddb1", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "id": "4101c303-2da3-431b-9375-9fa1747070af", "metadata": {}, "outputs": [ @@ -671,10 +495,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -683,17 +507,9 @@ "import mlrun.feature_store as fstore\n", "from mlrun.feature_store.steps import OneHotEncoder, 
MapValues, DateExtractor, DropFeatures\n", "\n", - "# Define and add value mapping\n", - "main_categories = list(factorize_key.keys())\n", - "\n", - "#main_categories = part_categories\n", - "\n", - "# One Hot Encode the newly defined mappings\n", - "#one_hot_encoder_mapping = {'transaction_category': main_categories}\n", - "\n", "# creating feature set\n", "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", - " entities=[fstore.Entity(\"transaction_id\")],\n", + " entities=[fstore.Entity(\"transaction_category\")],\n", " description=\"transactions feature set\")\n", "# setting up the graph\n", "extended_transactions_set.graph \\\n", @@ -704,7 +520,7 @@ "\n", "extended_transactions_set.graph \\\n", " .to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates') \\\n", - " .to(DropFeatures(features=['timestamp','transaction_category']))\n", + " .to(DropFeatures(features=['timestamp']))\n", "\n", "\n", "extended_transactions_set.set_targets()\n", @@ -714,7 +530,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "id": "53eb2151-447a-4eb0-be7f-a07f1cbea32d", "metadata": {}, "outputs": [ @@ -754,7 +570,7 @@ " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", - " 2024-02-07 04:55:01.016496\n", + " 2024-02-07 07:55:08.158933\n", " 3\n", " \n", " \n", @@ -763,7 +579,7 @@ " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", - " 2023-12-10 07:27:23.016496\n", + " 2023-12-10 10:27:30.158933\n", " 7\n", " \n", " \n", @@ -772,7 +588,7 @@ " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", - " 2024-01-17 00:58:55.016496\n", + " 2024-01-17 03:59:02.158933\n", " 11\n", " \n", " \n", @@ -781,7 +597,7 @@ " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", - " 2023-11-20 07:51:25.016496\n", + " 2023-11-20 10:51:32.158933\n", " 15\n", " \n", " \n", @@ -790,7 +606,7 @@ " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", - " 2023-11-06 04:33:31.016496\n", + " 
2023-11-06 07:33:38.158933\n", " 19\n", " \n", " \n", @@ -808,7 +624,7 @@ " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", - " 2024-01-09 20:29:31.016496\n", + " 2024-01-09 23:29:38.158933\n", " 99979\n", " \n", " \n", @@ -817,7 +633,7 @@ " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", - " 2024-02-05 06:43:16.016496\n", + " 2024-02-05 09:43:23.158933\n", " 99983\n", " \n", " \n", @@ -826,7 +642,7 @@ " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", - " 2023-12-14 14:17:04.016496\n", + " 2023-12-14 17:17:11.158933\n", " 99987\n", " \n", " \n", @@ -835,7 +651,7 @@ " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", - " 2024-02-20 07:35:41.016496\n", + " 2024-02-20 10:35:48.158933\n", " 99991\n", " \n", " \n", @@ -844,7 +660,7 @@ " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2023-12-15 12:05:49.016496\n", + " 2023-12-15 15:05:56.158933\n", " 99995\n", " \n", " \n", @@ -867,22 +683,22 @@ "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", "\n", " timestamp transaction_id \n", - "3 2024-02-07 04:55:01.016496 3 \n", - "7 2023-12-10 07:27:23.016496 7 \n", - "11 2024-01-17 00:58:55.016496 11 \n", - "15 2023-11-20 07:51:25.016496 15 \n", - "19 2023-11-06 04:33:31.016496 19 \n", + "3 2024-02-07 07:55:08.158933 3 \n", + "7 2023-12-10 10:27:30.158933 7 \n", + "11 2024-01-17 03:59:02.158933 11 \n", + "15 2023-11-20 10:51:32.158933 15 \n", + "19 2023-11-06 07:33:38.158933 19 \n", "... ... ... 
\n", - "99979 2024-01-09 20:29:31.016496 99979 \n", - "99983 2024-02-05 06:43:16.016496 99983 \n", - "99987 2023-12-14 14:17:04.016496 99987 \n", - "99991 2024-02-20 07:35:41.016496 99991 \n", - "99995 2023-12-15 12:05:49.016496 99995 \n", + "99979 2024-01-09 23:29:38.158933 99979 \n", + "99983 2024-02-05 09:43:23.158933 99983 \n", + "99987 2023-12-14 17:17:11.158933 99987 \n", + "99991 2024-02-20 10:35:48.158933 99991 \n", + "99995 2023-12-15 15:05:56.158933 99995 \n", "\n", "[24999 rows x 6 columns]" ] }, - "execution_count": 15, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -906,7 +722,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "id": "06c03ea5-8394-44ff-b81d-755e1c244269", "metadata": {}, "outputs": [ @@ -946,7 +762,7 @@ " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", - " 2024-04-05 06:11:54.114091\n", + " 2024-04-05 09:12:27.257945\n", " 3\n", " \n", " \n", @@ -955,7 +771,7 @@ " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", - " 2024-02-06 08:44:16.114091\n", + " 2024-02-06 11:44:49.257945\n", " 7\n", " \n", " \n", @@ -964,7 +780,7 @@ " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", - " 2024-03-15 02:15:48.114091\n", + " 2024-03-15 05:16:21.257945\n", " 11\n", " \n", " \n", @@ -973,7 +789,7 @@ " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", - " 2024-01-17 09:08:18.114091\n", + " 2024-01-17 12:08:51.257945\n", " 15\n", " \n", " \n", @@ -982,7 +798,7 @@ " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", - " 2024-01-03 05:50:24.114091\n", + " 2024-01-03 08:50:57.257945\n", " 19\n", " \n", " \n", @@ -1000,7 +816,7 @@ " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", - " 2024-03-07 21:46:24.114091\n", + " 2024-03-08 00:46:57.257945\n", " 99979\n", " \n", " \n", @@ -1009,7 +825,7 @@ " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", - " 2024-04-03 08:00:09.114091\n", + " 2024-04-03 11:00:42.257945\n", " 99983\n", " \n", " \n", @@ 
-1018,7 +834,7 @@ " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", - " 2024-02-10 15:33:57.114091\n", + " 2024-02-10 18:34:30.257945\n", " 99987\n", " \n", " \n", @@ -1027,7 +843,7 @@ " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", - " 2024-04-18 08:52:34.114091\n", + " 2024-04-18 11:53:07.257945\n", " 99991\n", " \n", " \n", @@ -1036,7 +852,7 @@ " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", - " 2024-02-11 13:22:42.114091\n", + " 2024-02-11 16:23:15.257945\n", " 99995\n", " \n", " \n", @@ -1059,22 +875,22 @@ "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", "\n", " timestamp transaction_id \n", - "3 2024-04-05 06:11:54.114091 3 \n", - "7 2024-02-06 08:44:16.114091 7 \n", - "11 2024-03-15 02:15:48.114091 11 \n", - "15 2024-01-17 09:08:18.114091 15 \n", - "19 2024-01-03 05:50:24.114091 19 \n", + "3 2024-04-05 09:12:27.257945 3 \n", + "7 2024-02-06 11:44:49.257945 7 \n", + "11 2024-03-15 05:16:21.257945 11 \n", + "15 2024-01-17 12:08:51.257945 15 \n", + "19 2024-01-03 08:50:57.257945 19 \n", "... ... ... 
\n", - "99979 2024-03-07 21:46:24.114091 99979 \n", - "99983 2024-04-03 08:00:09.114091 99983 \n", - "99987 2024-02-10 15:33:57.114091 99987 \n", - "99991 2024-04-18 08:52:34.114091 99991 \n", - "99995 2024-02-11 13:22:42.114091 99995 \n", + "99979 2024-03-08 00:46:57.257945 99979 \n", + "99983 2024-04-03 11:00:42.257945 99983 \n", + "99987 2024-02-10 18:34:30.257945 99987 \n", + "99991 2024-04-18 11:53:07.257945 99991 \n", + "99995 2024-02-11 16:23:15.257945 99995 \n", "\n", "[24999 rows x 6 columns]" ] }, - "execution_count": 16, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1087,19 +903,12 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 13:22:52,204 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" - ] - }, { "data": { "text/html": [ @@ -1126,6 +935,7 @@ " receiver_id\n", " sender_id\n", " amount\n", + " transaction_id\n", " timestamp_year\n", " timestamp_month\n", " timestamp_day\n", @@ -1135,7 +945,8 @@ " distance\n", " \n", " \n", - " transaction_id\n", + " transaction_category\n", + " \n", " \n", " \n", " \n", @@ -1152,79 +963,84 @@ " \n", " \n", " \n", - " 3\n", + " Uncategorized\n", " 0\n", - " 879.78\n", + " 879.780000\n", " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", + " 3\n", " 2024\n", " 4\n", " 5\n", - " 6\n", - " 11\n", - " 54\n", - " 439.890\n", + " 9\n", + " 12\n", + " 27\n", + " 0.000000\n", " \n", " \n", - " 7\n", + " Uncategorized\n", " 0\n", - " 801.22\n", + " 840.500000\n", " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", + " 7\n", " 2024\n", " 2\n", " 6\n", - " 8\n", + " 11\n", " 44\n", - " 16\n", - " 400.610\n", + " 49\n", + " 39.280000\n", " \n", " \n", - " 11\n", + " Uncategorized\n", " 0\n", - " 423.31\n", + " 
701.436667\n", " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", + " 11\n", " 2024\n", " 3\n", " 15\n", - " 2\n", - " 15\n", - " 48\n", - " 211.655\n", + " 5\n", + " 16\n", + " 21\n", + " 278.126667\n", " \n", " \n", - " 15\n", + " Uncategorized\n", " 0\n", - " 382.73\n", + " 621.760000\n", " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", + " 15\n", " 2024\n", " 1\n", " 17\n", - " 9\n", + " 12\n", " 8\n", - " 18\n", - " 191.365\n", + " 51\n", + " 239.030000\n", " \n", " \n", - " 19\n", + " Uncategorized\n", " 0\n", - " 111.77\n", + " 519.762000\n", " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", + " 19\n", " 2024\n", " 1\n", " 3\n", - " 5\n", + " 8\n", " 50\n", - " 24\n", - " 55.885\n", + " 57\n", + " 407.992000\n", " \n", " \n", " ...\n", @@ -1240,148 +1056,168 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", - " 99979\n", + " Pension and insurances\n", " 18\n", - " 302.10\n", + " 211.453636\n", " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", + " 99979\n", " 2024\n", " 3\n", - " 7\n", - " 21\n", + " 8\n", + " 0\n", " 46\n", - " 24\n", - " 151.050\n", + " 57\n", + " 90.646364\n", " \n", " \n", - " 99983\n", + " Pension and insurances\n", " 18\n", - " 115.89\n", + " 211.107391\n", " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", + " 99983\n", " 2024\n", " 4\n", " 3\n", - " 8\n", + " 11\n", " 0\n", - " 9\n", - " 57.945\n", + " 42\n", + " 95.217391\n", " \n", " \n", - " 99987\n", + " Pension and insurances\n", " 18\n", - " 207.08\n", + " 211.092852\n", " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", + " 99987\n", " 2024\n", " 2\n", " 10\n", - " 15\n", - " 33\n", - " 57\n", - " 103.540\n", + " 18\n", + " 34\n", + " 30\n", + " 4.012852\n", " \n", " \n", - " 99991\n", + " Pension and insurances\n", " 18\n", - " 355.58\n", + " 211.612590\n", " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", + " 99991\n", " 2024\n", " 4\n", " 18\n", - " 8\n", - " 52\n", - " 34\n", - " 177.790\n", + " 11\n", + 
" 53\n", + " 7\n", + " 143.967410\n", " \n", " \n", - " 99995\n", + " Pension and insurances\n", " 18\n", - " 204.26\n", + " 211.586237\n", " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", + " 99995\n", " 2024\n", " 2\n", " 11\n", - " 13\n", - " 22\n", - " 42\n", - " 102.130\n", + " 16\n", + " 23\n", + " 15\n", + " 7.326237\n", " \n", " \n", "\n", - "

24999 rows × 12 columns

\n", + "

24999 rows × 13 columns

\n", "" ], "text/plain": [ - " transaction_category_mapped amount_avg_1d receiver_id \\\n", - "transaction_id \n", - "3 0 879.78 4518551904499919 \n", - "7 0 801.22 4757951915669080 \n", - "11 0 423.31 4518551904499919 \n", - "15 0 382.73 4518551904499919 \n", - "19 0 111.77 4098088980692974 \n", - "... ... ... ... \n", - "99979 18 302.10 4179606860088849 \n", - "99983 18 115.89 4751538620733305 \n", - "99987 18 207.08 4405008355220324 \n", - "99991 18 355.58 4092115788877543 \n", - "99995 18 204.26 4262047194499006 \n", + " transaction_category_mapped amount_avg_1d \\\n", + "transaction_category \n", + "Uncategorized 0 879.780000 \n", + "Uncategorized 0 840.500000 \n", + "Uncategorized 0 701.436667 \n", + "Uncategorized 0 621.760000 \n", + "Uncategorized 0 519.762000 \n", + "... ... ... \n", + "Pension and insurances 18 211.453636 \n", + "Pension and insurances 18 211.107391 \n", + "Pension and insurances 18 211.092852 \n", + "Pension and insurances 18 211.612590 \n", + "Pension and insurances 18 211.586237 \n", + "\n", + " receiver_id sender_id amount \\\n", + "transaction_category \n", + "Uncategorized 4518551904499919 4457298962882528 879.78 \n", + "Uncategorized 4757951915669080 4655296518888015 801.22 \n", + "Uncategorized 4518551904499919 4910949333064003 423.31 \n", + "Uncategorized 4518551904499919 4415760195692405 382.73 \n", + "Uncategorized 4098088980692974 4412940106031926 111.77 \n", + "... ... ... ... 
\n", + "Pension and insurances 4179606860088849 4359198069543354 302.10 \n", + "Pension and insurances 4751538620733305 4021524999937895 115.89 \n", + "Pension and insurances 4405008355220324 4165276502284291 207.08 \n", + "Pension and insurances 4092115788877543 4328901131757235 355.58 \n", + "Pension and insurances 4262047194499006 4017367486513464 204.26 \n", "\n", - " sender_id amount timestamp_year timestamp_month \\\n", - "transaction_id \n", - "3 4457298962882528 879.78 2024 4 \n", - "7 4655296518888015 801.22 2024 2 \n", - "11 4910949333064003 423.31 2024 3 \n", - "15 4415760195692405 382.73 2024 1 \n", - "19 4412940106031926 111.77 2024 1 \n", - "... ... ... ... ... \n", - "99979 4359198069543354 302.10 2024 3 \n", - "99983 4021524999937895 115.89 2024 4 \n", - "99987 4165276502284291 207.08 2024 2 \n", - "99991 4328901131757235 355.58 2024 4 \n", - "99995 4017367486513464 204.26 2024 2 \n", + " transaction_id timestamp_year timestamp_month \\\n", + "transaction_category \n", + "Uncategorized 3 2024 4 \n", + "Uncategorized 7 2024 2 \n", + "Uncategorized 11 2024 3 \n", + "Uncategorized 15 2024 1 \n", + "Uncategorized 19 2024 1 \n", + "... ... ... ... \n", + "Pension and insurances 99979 2024 3 \n", + "Pension and insurances 99983 2024 4 \n", + "Pension and insurances 99987 2024 2 \n", + "Pension and insurances 99991 2024 4 \n", + "Pension and insurances 99995 2024 2 \n", "\n", - " timestamp_day timestamp_hour timestamp_minute \\\n", - "transaction_id \n", - "3 5 6 11 \n", - "7 6 8 44 \n", - "11 15 2 15 \n", - "15 17 9 8 \n", - "19 3 5 50 \n", - "... ... ... ... \n", - "99979 7 21 46 \n", - "99983 3 8 0 \n", - "99987 10 15 33 \n", - "99991 18 8 52 \n", - "99995 11 13 22 \n", + " timestamp_day timestamp_hour timestamp_minute \\\n", + "transaction_category \n", + "Uncategorized 5 9 12 \n", + "Uncategorized 6 11 44 \n", + "Uncategorized 15 5 16 \n", + "Uncategorized 17 12 8 \n", + "Uncategorized 3 8 50 \n", + "... ... ... ... 
\n", + "Pension and insurances 8 0 46 \n", + "Pension and insurances 3 11 0 \n", + "Pension and insurances 10 18 34 \n", + "Pension and insurances 18 11 53 \n", + "Pension and insurances 11 16 23 \n", "\n", - " timestamp_second distance \n", - "transaction_id \n", - "3 54 439.890 \n", - "7 16 400.610 \n", - "11 48 211.655 \n", - "15 18 191.365 \n", - "19 24 55.885 \n", - "... ... ... \n", - "99979 24 151.050 \n", - "99983 9 57.945 \n", - "99987 57 103.540 \n", - "99991 34 177.790 \n", - "99995 42 102.130 \n", + " timestamp_second distance \n", + "transaction_category \n", + "Uncategorized 27 0.000000 \n", + "Uncategorized 49 39.280000 \n", + "Uncategorized 21 278.126667 \n", + "Uncategorized 51 239.030000 \n", + "Uncategorized 57 407.992000 \n", + "... ... ... \n", + "Pension and insurances 57 90.646364 \n", + "Pension and insurances 42 95.217391 \n", + "Pension and insurances 30 4.012852 \n", + "Pension and insurances 7 143.967410 \n", + "Pension and insurances 15 7.326237 \n", "\n", - "[24999 rows x 12 columns]" + "[24999 rows x 13 columns]" ] }, - "execution_count": 17, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1393,7 +1229,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "id": "6595564d-91a2-49c0-93e1-dc8ebb28467d", "metadata": {}, "outputs": [ @@ -1423,6 +1259,7 @@ " receiver_id\n", " sender_id\n", " amount\n", + " transaction_id\n", " timestamp_year\n", " timestamp_month\n", " timestamp_day\n", @@ -1432,7 +1269,8 @@ " distance\n", " \n", " \n", - " transaction_id\n", + " transaction_category\n", + " \n", " \n", " \n", " \n", @@ -1449,79 +1287,84 @@ " \n", " \n", " \n", - " 3\n", + " Uncategorized\n", " 0\n", - " 879.78\n", + " 879.780000\n", " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", + " 3\n", " 2024\n", " 4\n", " 5\n", - " 6\n", - " 11\n", - " 54\n", - " 439.890\n", + " 9\n", + " 12\n", + " 27\n", + " 0.000000\n", " \n", " \n", - " 7\n", + " Uncategorized\n", " 0\n", - " 
801.22\n", + " 840.500000\n", " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", + " 7\n", " 2024\n", " 2\n", " 6\n", - " 8\n", + " 11\n", " 44\n", - " 16\n", - " 400.610\n", + " 49\n", + " 39.280000\n", " \n", " \n", - " 11\n", + " Uncategorized\n", " 0\n", - " 423.31\n", + " 701.436667\n", " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", + " 11\n", " 2024\n", " 3\n", " 15\n", - " 2\n", - " 15\n", - " 48\n", - " 211.655\n", + " 5\n", + " 16\n", + " 21\n", + " 278.126667\n", " \n", " \n", - " 15\n", + " Uncategorized\n", " 0\n", - " 382.73\n", + " 621.760000\n", " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", + " 15\n", " 2024\n", " 1\n", " 17\n", - " 9\n", + " 12\n", " 8\n", - " 18\n", - " 191.365\n", + " 51\n", + " 239.030000\n", " \n", " \n", - " 19\n", + " Uncategorized\n", " 0\n", - " 111.77\n", + " 519.762000\n", " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", + " 19\n", " 2024\n", " 1\n", " 3\n", - " 5\n", + " 8\n", " 50\n", - " 24\n", - " 55.885\n", + " 57\n", + " 407.992000\n", " \n", " \n", " ...\n", @@ -1537,148 +1380,168 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", - " 99979\n", + " Pension and insurances\n", " 18\n", - " 302.10\n", + " 211.453636\n", " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", + " 99979\n", " 2024\n", " 3\n", - " 7\n", - " 21\n", + " 8\n", + " 0\n", " 46\n", - " 24\n", - " 151.050\n", + " 57\n", + " 90.646364\n", " \n", " \n", - " 99983\n", + " Pension and insurances\n", " 18\n", - " 115.89\n", + " 211.107391\n", " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", + " 99983\n", " 2024\n", " 4\n", " 3\n", - " 8\n", + " 11\n", " 0\n", - " 9\n", - " 57.945\n", + " 42\n", + " 95.217391\n", " \n", " \n", - " 99987\n", + " Pension and insurances\n", " 18\n", - " 207.08\n", + " 211.092852\n", " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", + " 99987\n", " 2024\n", " 2\n", " 10\n", - " 15\n", - " 33\n", - " 57\n", - " 103.540\n", + " 18\n", + " 34\n", + " 
30\n", + " 4.012852\n", " \n", " \n", - " 99991\n", + " Pension and insurances\n", " 18\n", - " 355.58\n", + " 211.612590\n", " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", + " 99991\n", " 2024\n", " 4\n", " 18\n", - " 8\n", - " 52\n", - " 34\n", - " 177.790\n", + " 11\n", + " 53\n", + " 7\n", + " 143.967410\n", " \n", " \n", - " 99995\n", + " Pension and insurances\n", " 18\n", - " 204.26\n", + " 211.586237\n", " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", + " 99995\n", " 2024\n", " 2\n", " 11\n", - " 13\n", - " 22\n", - " 42\n", - " 102.130\n", + " 16\n", + " 23\n", + " 15\n", + " 7.326237\n", " \n", " \n", "\n", - "

24999 rows × 12 columns

\n", + "

24999 rows × 13 columns

\n", "" ], "text/plain": [ - " transaction_category_mapped amount_avg_1d receiver_id \\\n", - "transaction_id \n", - "3 0 879.78 4518551904499919 \n", - "7 0 801.22 4757951915669080 \n", - "11 0 423.31 4518551904499919 \n", - "15 0 382.73 4518551904499919 \n", - "19 0 111.77 4098088980692974 \n", - "... ... ... ... \n", - "99979 18 302.10 4179606860088849 \n", - "99983 18 115.89 4751538620733305 \n", - "99987 18 207.08 4405008355220324 \n", - "99991 18 355.58 4092115788877543 \n", - "99995 18 204.26 4262047194499006 \n", + " transaction_category_mapped amount_avg_1d \\\n", + "transaction_category \n", + "Uncategorized 0 879.780000 \n", + "Uncategorized 0 840.500000 \n", + "Uncategorized 0 701.436667 \n", + "Uncategorized 0 621.760000 \n", + "Uncategorized 0 519.762000 \n", + "... ... ... \n", + "Pension and insurances 18 211.453636 \n", + "Pension and insurances 18 211.107391 \n", + "Pension and insurances 18 211.092852 \n", + "Pension and insurances 18 211.612590 \n", + "Pension and insurances 18 211.586237 \n", "\n", - " sender_id amount timestamp_year timestamp_month \\\n", - "transaction_id \n", - "3 4457298962882528 879.78 2024 4 \n", - "7 4655296518888015 801.22 2024 2 \n", - "11 4910949333064003 423.31 2024 3 \n", - "15 4415760195692405 382.73 2024 1 \n", - "19 4412940106031926 111.77 2024 1 \n", - "... ... ... ... ... \n", - "99979 4359198069543354 302.10 2024 3 \n", - "99983 4021524999937895 115.89 2024 4 \n", - "99987 4165276502284291 207.08 2024 2 \n", - "99991 4328901131757235 355.58 2024 4 \n", - "99995 4017367486513464 204.26 2024 2 \n", + " receiver_id sender_id amount \\\n", + "transaction_category \n", + "Uncategorized 4518551904499919 4457298962882528 879.78 \n", + "Uncategorized 4757951915669080 4655296518888015 801.22 \n", + "Uncategorized 4518551904499919 4910949333064003 423.31 \n", + "Uncategorized 4518551904499919 4415760195692405 382.73 \n", + "Uncategorized 4098088980692974 4412940106031926 111.77 \n", + "... ... ... ... 
\n", + "Pension and insurances 4179606860088849 4359198069543354 302.10 \n", + "Pension and insurances 4751538620733305 4021524999937895 115.89 \n", + "Pension and insurances 4405008355220324 4165276502284291 207.08 \n", + "Pension and insurances 4092115788877543 4328901131757235 355.58 \n", + "Pension and insurances 4262047194499006 4017367486513464 204.26 \n", "\n", - " timestamp_day timestamp_hour timestamp_minute \\\n", - "transaction_id \n", - "3 5 6 11 \n", - "7 6 8 44 \n", - "11 15 2 15 \n", - "15 17 9 8 \n", - "19 3 5 50 \n", - "... ... ... ... \n", - "99979 7 21 46 \n", - "99983 3 8 0 \n", - "99987 10 15 33 \n", - "99991 18 8 52 \n", - "99995 11 13 22 \n", + " transaction_id timestamp_year timestamp_month \\\n", + "transaction_category \n", + "Uncategorized 3 2024 4 \n", + "Uncategorized 7 2024 2 \n", + "Uncategorized 11 2024 3 \n", + "Uncategorized 15 2024 1 \n", + "Uncategorized 19 2024 1 \n", + "... ... ... ... \n", + "Pension and insurances 99979 2024 3 \n", + "Pension and insurances 99983 2024 4 \n", + "Pension and insurances 99987 2024 2 \n", + "Pension and insurances 99991 2024 4 \n", + "Pension and insurances 99995 2024 2 \n", "\n", - " timestamp_second distance \n", - "transaction_id \n", - "3 54 439.890 \n", - "7 16 400.610 \n", - "11 48 211.655 \n", - "15 18 191.365 \n", - "19 24 55.885 \n", - "... ... ... \n", - "99979 24 151.050 \n", - "99983 9 57.945 \n", - "99987 57 103.540 \n", - "99991 34 177.790 \n", - "99995 42 102.130 \n", + " timestamp_day timestamp_hour timestamp_minute \\\n", + "transaction_category \n", + "Uncategorized 5 9 12 \n", + "Uncategorized 6 11 44 \n", + "Uncategorized 15 5 16 \n", + "Uncategorized 17 12 8 \n", + "Uncategorized 3 8 50 \n", + "... ... ... ... 
\n", + "Pension and insurances 8 0 46 \n", + "Pension and insurances 3 11 0 \n", + "Pension and insurances 10 18 34 \n", + "Pension and insurances 18 11 53 \n", + "Pension and insurances 11 16 23 \n", "\n", - "[24999 rows x 12 columns]" + " timestamp_second distance \n", + "transaction_category \n", + "Uncategorized 27 0.000000 \n", + "Uncategorized 49 39.280000 \n", + "Uncategorized 21 278.126667 \n", + "Uncategorized 51 239.030000 \n", + "Uncategorized 57 407.992000 \n", + "... ... ... \n", + "Pension and insurances 57 90.646364 \n", + "Pension and insurances 42 95.217391 \n", + "Pension and insurances 30 4.012852 \n", + "Pension and insurances 7 143.967410 \n", + "Pension and insurances 15 7.326237 \n", + "\n", + "[24999 rows x 13 columns]" ] }, - "execution_count": 18, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1692,40 +1555,7 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "e1d377a5-cf7e-4564-8e14-10bfbaca4da2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['transaction_category_mapped',\n", - " 'amount_avg_1d',\n", - " 'receiver_id',\n", - " 'sender_id',\n", - " 'amount',\n", - " 'timestamp_year',\n", - " 'timestamp_month',\n", - " 'timestamp_day',\n", - " 'timestamp_hour',\n", - " 'timestamp_minute',\n", - " 'timestamp_second',\n", - " 'distance']" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_cols = list(data.columns)\n", - "data_cols" - ] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": 25, "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", "metadata": {}, "outputs": [], @@ -1733,6 +1563,7 @@ "# Import MLRun's Feature Store\n", "import mlrun.feature_store as fstore\n", "\n", + "data_cols = list(data.columns)\n", "# create feature vector on top of aggreagations\n", "# Define the list of features we will be using\n", "features = [f\"transactions.{name}\" for name in data_cols[1:]] \n", @@ 
-1754,7 +1585,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 26, "id": "eb69d9fa-22a9-4b9f-9443-d00d9190ad55", "metadata": {}, "outputs": [ @@ -1783,6 +1614,7 @@ " receiver_id\n", " sender_id\n", " amount\n", + " transaction_id\n", " timestamp_year\n", " timestamp_month\n", " timestamp_day\n", @@ -1796,77 +1628,82 @@ " \n", " \n", " 0\n", - " 879.78\n", + " 879.780000\n", " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", + " 3\n", " 2024\n", " 4\n", " 5\n", - " 6\n", - " 11\n", - " 54\n", - " 439.890\n", + " 9\n", + " 12\n", + " 27\n", + " 0.000000\n", " 0\n", " \n", " \n", " 1\n", - " 801.22\n", + " 840.500000\n", " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", + " 7\n", " 2024\n", " 2\n", " 6\n", - " 8\n", + " 11\n", " 44\n", - " 16\n", - " 400.610\n", + " 49\n", + " 39.280000\n", " 0\n", " \n", " \n", " 2\n", - " 423.31\n", + " 701.436667\n", " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", + " 11\n", " 2024\n", " 3\n", " 15\n", - " 2\n", - " 15\n", - " 48\n", - " 211.655\n", + " 5\n", + " 16\n", + " 21\n", + " 278.126667\n", " 0\n", " \n", " \n", " 3\n", - " 382.73\n", + " 621.760000\n", " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", + " 15\n", " 2024\n", " 1\n", " 17\n", - " 9\n", + " 12\n", " 8\n", - " 18\n", - " 191.365\n", + " 51\n", + " 239.030000\n", " 0\n", " \n", " \n", " 4\n", - " 111.77\n", + " 519.762000\n", " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", + " 19\n", " 2024\n", " 1\n", " 3\n", - " 5\n", + " 8\n", " 50\n", - " 24\n", - " 55.885\n", + " 57\n", + " 407.992000\n", " 0\n", " \n", " \n", @@ -1883,126 +1720,132 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 24994\n", - " 302.10\n", + " 211.453636\n", " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", + " 99979\n", " 2024\n", " 3\n", - " 7\n", - " 21\n", + " 8\n", + " 0\n", " 46\n", - " 24\n", - " 151.050\n", + " 57\n", + " 90.646364\n", " 18\n", " \n", " \n", " 24995\n", - " 115.89\n", + 
" 211.107391\n", " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", + " 99983\n", " 2024\n", " 4\n", " 3\n", - " 8\n", + " 11\n", " 0\n", - " 9\n", - " 57.945\n", + " 42\n", + " 95.217391\n", " 18\n", " \n", " \n", " 24996\n", - " 207.08\n", + " 211.092852\n", " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", + " 99987\n", " 2024\n", " 2\n", " 10\n", - " 15\n", - " 33\n", - " 57\n", - " 103.540\n", + " 18\n", + " 34\n", + " 30\n", + " 4.012852\n", " 18\n", " \n", " \n", " 24997\n", - " 355.58\n", + " 211.612590\n", " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", + " 99991\n", " 2024\n", " 4\n", " 18\n", - " 8\n", - " 52\n", - " 34\n", - " 177.790\n", + " 11\n", + " 53\n", + " 7\n", + " 143.967410\n", " 18\n", " \n", " \n", " 24998\n", - " 204.26\n", + " 211.586237\n", " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", + " 99995\n", " 2024\n", " 2\n", " 11\n", - " 13\n", - " 22\n", - " 42\n", - " 102.130\n", + " 16\n", + " 23\n", + " 15\n", + " 7.326237\n", " 18\n", " \n", " \n", "\n", - "

24999 rows × 12 columns

\n", + "

24999 rows × 13 columns

\n", "" ], "text/plain": [ " amount_avg_1d receiver_id sender_id amount \\\n", - "0 879.78 4518551904499919 4457298962882528 879.78 \n", - "1 801.22 4757951915669080 4655296518888015 801.22 \n", - "2 423.31 4518551904499919 4910949333064003 423.31 \n", - "3 382.73 4518551904499919 4415760195692405 382.73 \n", - "4 111.77 4098088980692974 4412940106031926 111.77 \n", + "0 879.780000 4518551904499919 4457298962882528 879.78 \n", + "1 840.500000 4757951915669080 4655296518888015 801.22 \n", + "2 701.436667 4518551904499919 4910949333064003 423.31 \n", + "3 621.760000 4518551904499919 4415760195692405 382.73 \n", + "4 519.762000 4098088980692974 4412940106031926 111.77 \n", "... ... ... ... ... \n", - "24994 302.10 4179606860088849 4359198069543354 302.10 \n", - "24995 115.89 4751538620733305 4021524999937895 115.89 \n", - "24996 207.08 4405008355220324 4165276502284291 207.08 \n", - "24997 355.58 4092115788877543 4328901131757235 355.58 \n", - "24998 204.26 4262047194499006 4017367486513464 204.26 \n", + "24994 211.453636 4179606860088849 4359198069543354 302.10 \n", + "24995 211.107391 4751538620733305 4021524999937895 115.89 \n", + "24996 211.092852 4405008355220324 4165276502284291 207.08 \n", + "24997 211.612590 4092115788877543 4328901131757235 355.58 \n", + "24998 211.586237 4262047194499006 4017367486513464 204.26 \n", "\n", - " timestamp_year timestamp_month timestamp_day timestamp_hour \\\n", - "0 2024 4 5 6 \n", - "1 2024 2 6 8 \n", - "2 2024 3 15 2 \n", - "3 2024 1 17 9 \n", - "4 2024 1 3 5 \n", - "... ... ... ... ... \n", - "24994 2024 3 7 21 \n", - "24995 2024 4 3 8 \n", - "24996 2024 2 10 15 \n", - "24997 2024 4 18 8 \n", - "24998 2024 2 11 13 \n", + " transaction_id timestamp_year timestamp_month timestamp_day \\\n", + "0 3 2024 4 5 \n", + "1 7 2024 2 6 \n", + "2 11 2024 3 15 \n", + "3 15 2024 1 17 \n", + "4 19 2024 1 3 \n", + "... ... ... ... ... 
\n", + "24994 99979 2024 3 8 \n", + "24995 99983 2024 4 3 \n", + "24996 99987 2024 2 10 \n", + "24997 99991 2024 4 18 \n", + "24998 99995 2024 2 11 \n", "\n", - " timestamp_minute timestamp_second distance \\\n", - "0 11 54 439.890 \n", - "1 44 16 400.610 \n", - "2 15 48 211.655 \n", - "3 8 18 191.365 \n", - "4 50 24 55.885 \n", - "... ... ... ... \n", - "24994 46 24 151.050 \n", - "24995 0 9 57.945 \n", - "24996 33 57 103.540 \n", - "24997 52 34 177.790 \n", - "24998 22 42 102.130 \n", + " timestamp_hour timestamp_minute timestamp_second distance \\\n", + "0 9 12 27 0.000000 \n", + "1 11 44 49 39.280000 \n", + "2 5 16 21 278.126667 \n", + "3 12 8 51 239.030000 \n", + "4 8 50 57 407.992000 \n", + "... ... ... ... ... \n", + "24994 0 46 57 90.646364 \n", + "24995 11 0 42 95.217391 \n", + "24996 18 34 30 4.012852 \n", + "24997 11 53 7 143.967410 \n", + "24998 16 23 15 7.326237 \n", "\n", " transaction_category_mapped \n", "0 0 \n", @@ -2017,10 +1860,10 @@ "24997 18 \n", "24998 18 \n", "\n", - "[24999 rows x 12 columns]" + "[24999 rows x 13 columns]" ] }, - "execution_count": 21, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2035,67 +1878,35 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 27, "id": "cb156ebe-9846-4ff3-a388-92362df7c741", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{'amount_avg_1d': 10.45,\n", - " 'receiver_id': 4365439229004487,\n", - " 'sender_id': 4895135853273971,\n", - " 'amount': 10.45,\n", + "[{'amount_avg_1d': 211.58623655913973,\n", + " 'receiver_id': 4262047194499006,\n", + " 'sender_id': 4017367486513464,\n", + " 'amount': 204.26,\n", + " 'transaction_id': 99995,\n", " 'timestamp_year': 2024,\n", - " 'timestamp_month': 3,\n", - " 'timestamp_day': 20,\n", - " 'timestamp_hour': 5,\n", - " 'timestamp_minute': 57,\n", - " 'timestamp_second': 11,\n", - " 'distance': 5.225}]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - 
"source": [ - "svc = transactions_fv.get_online_feature_service()\n", - "resp = svc.get([{\"transaction_id\": \"24995\"}])\n", - "resp" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "7737865e-21a1-4bfe-b24a-29925145280f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[10.45,\n", - " 4365439229004487,\n", - " 4895135853273971,\n", - " 10.45,\n", - " 2024,\n", - " 3,\n", - " 20,\n", - " 5,\n", - " 57,\n", - " 11,\n", - " 5.225]]" + " 'timestamp_month': 2,\n", + " 'timestamp_day': 11,\n", + " 'timestamp_hour': 16,\n", + " 'timestamp_minute': 23,\n", + " 'timestamp_second': 15,\n", + " 'distance': 7.326236559139744}]" ] }, - "execution_count": 23, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "svc = transactions_fv.get_online_feature_service()\n", - "resp = svc.get([{\"transaction_id\": \"24995\"}],as_list=True)\n", + "resp = svc.get([{\"transaction_category\": \"Pension and insurances\"}])\n", "resp" ] }, @@ -2138,7 +1949,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 28, "id": "1cbb00b5-46bf-4a20-aad9-a03716ab97ae", "metadata": {}, "outputs": [ @@ -2168,6 +1979,7 @@ " receiver_id\n", " sender_id\n", " amount\n", + " transaction_id\n", " timestamp_year\n", " timestamp_month\n", " timestamp_day\n", @@ -2181,77 +1993,82 @@ " \n", " 0\n", " 0\n", - " 879.78\n", + " 879.780000\n", " 4518551904499919\n", " 4457298962882528\n", " 879.78\n", + " 3\n", " 2024\n", " 4\n", " 5\n", - " 6\n", - " 11\n", - " 54\n", - " 439.890\n", + " 9\n", + " 12\n", + " 27\n", + " 0.000000\n", " \n", " \n", " 1\n", " 0\n", - " 801.22\n", + " 840.500000\n", " 4757951915669080\n", " 4655296518888015\n", " 801.22\n", + " 7\n", " 2024\n", " 2\n", " 6\n", - " 8\n", + " 11\n", " 44\n", - " 16\n", - " 400.610\n", + " 49\n", + " 39.280000\n", " \n", " \n", " 2\n", " 0\n", - " 423.31\n", + " 701.436667\n", " 4518551904499919\n", " 4910949333064003\n", " 423.31\n", + " 11\n", " 
2024\n", " 3\n", " 15\n", - " 2\n", - " 15\n", - " 48\n", - " 211.655\n", + " 5\n", + " 16\n", + " 21\n", + " 278.126667\n", " \n", " \n", " 3\n", " 0\n", - " 382.73\n", + " 621.760000\n", " 4518551904499919\n", " 4415760195692405\n", " 382.73\n", + " 15\n", " 2024\n", " 1\n", " 17\n", - " 9\n", + " 12\n", " 8\n", - " 18\n", - " 191.365\n", + " 51\n", + " 239.030000\n", " \n", " \n", " 4\n", " 0\n", - " 111.77\n", + " 519.762000\n", " 4098088980692974\n", " 4412940106031926\n", " 111.77\n", + " 19\n", " 2024\n", " 1\n", " 3\n", - " 5\n", + " 8\n", " 50\n", - " 24\n", - " 55.885\n", + " 57\n", + " 407.992000\n", " \n", " \n", " ...\n", @@ -2267,144 +2084,150 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 24994\n", " 18\n", - " 302.10\n", + " 211.453636\n", " 4179606860088849\n", " 4359198069543354\n", " 302.10\n", + " 99979\n", " 2024\n", " 3\n", - " 7\n", - " 21\n", + " 8\n", + " 0\n", " 46\n", - " 24\n", - " 151.050\n", + " 57\n", + " 90.646364\n", " \n", " \n", " 24995\n", " 18\n", - " 115.89\n", + " 211.107391\n", " 4751538620733305\n", " 4021524999937895\n", " 115.89\n", + " 99983\n", " 2024\n", " 4\n", " 3\n", - " 8\n", + " 11\n", " 0\n", - " 9\n", - " 57.945\n", + " 42\n", + " 95.217391\n", " \n", " \n", " 24996\n", " 18\n", - " 207.08\n", + " 211.092852\n", " 4405008355220324\n", " 4165276502284291\n", " 207.08\n", + " 99987\n", " 2024\n", " 2\n", " 10\n", - " 15\n", - " 33\n", - " 57\n", - " 103.540\n", + " 18\n", + " 34\n", + " 30\n", + " 4.012852\n", " \n", " \n", " 24997\n", " 18\n", - " 355.58\n", + " 211.612590\n", " 4092115788877543\n", " 4328901131757235\n", " 355.58\n", + " 99991\n", " 2024\n", " 4\n", " 18\n", - " 8\n", - " 52\n", - " 34\n", - " 177.790\n", + " 11\n", + " 53\n", + " 7\n", + " 143.967410\n", " \n", " \n", " 24998\n", " 18\n", - " 204.26\n", + " 211.586237\n", " 4262047194499006\n", " 4017367486513464\n", " 204.26\n", + " 99995\n", " 2024\n", " 2\n", " 11\n", - " 13\n", - " 22\n", - " 42\n", - " 102.130\n", + " 16\n", + 
" 23\n", + " 15\n", + " 7.326237\n", " \n", " \n", "\n", - "

24999 rows × 12 columns

\n", + "

24999 rows × 13 columns

\n", "" ], "text/plain": [ " transaction_category_mapped amount_avg_1d receiver_id \\\n", - "0 0 879.78 4518551904499919 \n", - "1 0 801.22 4757951915669080 \n", - "2 0 423.31 4518551904499919 \n", - "3 0 382.73 4518551904499919 \n", - "4 0 111.77 4098088980692974 \n", + "0 0 879.780000 4518551904499919 \n", + "1 0 840.500000 4757951915669080 \n", + "2 0 701.436667 4518551904499919 \n", + "3 0 621.760000 4518551904499919 \n", + "4 0 519.762000 4098088980692974 \n", "... ... ... ... \n", - "24994 18 302.10 4179606860088849 \n", - "24995 18 115.89 4751538620733305 \n", - "24996 18 207.08 4405008355220324 \n", - "24997 18 355.58 4092115788877543 \n", - "24998 18 204.26 4262047194499006 \n", + "24994 18 211.453636 4179606860088849 \n", + "24995 18 211.107391 4751538620733305 \n", + "24996 18 211.092852 4405008355220324 \n", + "24997 18 211.612590 4092115788877543 \n", + "24998 18 211.586237 4262047194499006 \n", "\n", - " sender_id amount timestamp_year timestamp_month \\\n", - "0 4457298962882528 879.78 2024 4 \n", - "1 4655296518888015 801.22 2024 2 \n", - "2 4910949333064003 423.31 2024 3 \n", - "3 4415760195692405 382.73 2024 1 \n", - "4 4412940106031926 111.77 2024 1 \n", - "... ... ... ... ... \n", - "24994 4359198069543354 302.10 2024 3 \n", - "24995 4021524999937895 115.89 2024 4 \n", - "24996 4165276502284291 207.08 2024 2 \n", - "24997 4328901131757235 355.58 2024 4 \n", - "24998 4017367486513464 204.26 2024 2 \n", + " sender_id amount transaction_id timestamp_year \\\n", + "0 4457298962882528 879.78 3 2024 \n", + "1 4655296518888015 801.22 7 2024 \n", + "2 4910949333064003 423.31 11 2024 \n", + "3 4415760195692405 382.73 15 2024 \n", + "4 4412940106031926 111.77 19 2024 \n", + "... ... ... ... ... 
\n", + "24994 4359198069543354 302.10 99979 2024 \n", + "24995 4021524999937895 115.89 99983 2024 \n", + "24996 4165276502284291 207.08 99987 2024 \n", + "24997 4328901131757235 355.58 99991 2024 \n", + "24998 4017367486513464 204.26 99995 2024 \n", "\n", - " timestamp_day timestamp_hour timestamp_minute timestamp_second \\\n", - "0 5 6 11 54 \n", - "1 6 8 44 16 \n", - "2 15 2 15 48 \n", - "3 17 9 8 18 \n", - "4 3 5 50 24 \n", - "... ... ... ... ... \n", - "24994 7 21 46 24 \n", - "24995 3 8 0 9 \n", - "24996 10 15 33 57 \n", - "24997 18 8 52 34 \n", - "24998 11 13 22 42 \n", + " timestamp_month timestamp_day timestamp_hour timestamp_minute \\\n", + "0 4 5 9 12 \n", + "1 2 6 11 44 \n", + "2 3 15 5 16 \n", + "3 1 17 12 8 \n", + "4 1 3 8 50 \n", + "... ... ... ... ... \n", + "24994 3 8 0 46 \n", + "24995 4 3 11 0 \n", + "24996 2 10 18 34 \n", + "24997 4 18 11 53 \n", + "24998 2 11 16 23 \n", "\n", - " distance \n", - "0 439.890 \n", - "1 400.610 \n", - "2 211.655 \n", - "3 191.365 \n", - "4 55.885 \n", - "... ... \n", - "24994 151.050 \n", - "24995 57.945 \n", - "24996 103.540 \n", - "24997 177.790 \n", - "24998 102.130 \n", + " timestamp_second distance \n", + "0 27 0.000000 \n", + "1 49 39.280000 \n", + "2 21 278.126667 \n", + "3 51 239.030000 \n", + "4 57 407.992000 \n", + "... ... ... 
\n", + "24994 57 90.646364 \n", + "24995 42 95.217391 \n", + "24996 30 4.012852 \n", + "24997 7 143.967410 \n", + "24998 15 7.326237 \n", "\n", - "[24999 rows x 12 columns]" + "[24999 rows x 13 columns]" ] }, - "execution_count": 24, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2427,7 +2250,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 29, "id": "47512de3-60ac-49c7-ace8-031959527e86", "metadata": {}, "outputs": [], @@ -2448,7 +2271,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 30, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -2468,7 +2291,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 31, "id": "e1ca2543", "metadata": {}, "outputs": [], @@ -2494,7 +2317,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 32, "id": "a41b6a7d", "metadata": {}, "outputs": [], @@ -2512,7 +2335,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 33, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -2535,7 +2358,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 34, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -2560,7 +2383,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 35, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -2588,7 +2411,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 36, "id": "c24e06fc", "metadata": { "scrolled": true @@ -2598,143 +2421,143 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-13-24-31-606\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-16-32-32-584\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-02-11 13:24:31 Starting - Starting the training job...\n", - "2024-02-11 13:24:45 Starting - Preparing the instances for training......\n", - 
"2024-02-11 13:25:57 Downloading - Downloading input data...\n", - "2024-02-11 13:26:31 Downloading - Downloading the training image......\n", - "2024-02-11 13:27:27 Training - Training image download completed. Training in progress....\n", - "2024-02-11 13:28:02 Uploading - Uploading generated training model\u001b[34m[2024-02-11 13:27:38.372 ip-10-0-187-204.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-11 16:32:32 Starting - Starting the training job...\n", + "2024-02-11 16:32:46 Starting - Preparing the instances for training......\n", + "2024-02-11 16:33:57 Downloading - Downloading input data......\n", + "2024-02-11 16:34:37 Downloading - Downloading the training image...\n", + "2024-02-11 16:35:23 Training - Training image download completed. 
Training in progress...\u001b[34m[2024-02-11 16:35:36.980 ip-10-0-69-198.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Train matrix has 17499 rows and 11 columns\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Validation matrix has 5000 rows\u001b[0m\n", - "\u001b[34m[2024-02-11 13:27:38.503 ip-10-0-187-204.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-11 13:27:38.504 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-11 13:27:38.505 ip-10-0-187-204.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-11 13:27:38.505 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-11 13:27:38.506 ip-10-0-187-204.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-11:13:27:38:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.55820#011validation-merror:0.58340\u001b[0m\n", - "\u001b[34m[2024-02-11 13:27:38.726 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-11 13:27:38.729 ip-10-0-187-204.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.54935#011validation-merror:0.57620\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.53712#011validation-merror:0.56500\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.52723#011validation-merror:0.55580\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.52312#011validation-merror:0.54980\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.51586#011validation-merror:0.54260\u001b[0m\n", - "\u001b[34m[6]#011train-merror:0.50992#011validation-merror:0.53440\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.50723#011validation-merror:0.53080\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.49729#011validation-merror:0.52640\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.49163#011validation-merror:0.52160\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.48740#011validation-merror:0.51680\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.48146#011validation-merror:0.51300\u001b[0m\n", - 
"\u001b[34m[12]#011train-merror:0.47597#011validation-merror:0.50720\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.47151#011validation-merror:0.50540\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.46208#011validation-merror:0.49700\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.45917#011validation-merror:0.49240\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.45363#011validation-merror:0.49120\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.44460#011validation-merror:0.48360\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.44134#011validation-merror:0.47800\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.43745#011validation-merror:0.47400\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.43420#011validation-merror:0.47180\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.42848#011validation-merror:0.46740\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.42100#011validation-merror:0.46000\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.41385#011validation-merror:0.45320\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.40894#011validation-merror:0.45100\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.40339#011validation-merror:0.44400\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.39734#011validation-merror:0.43880\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.39105#011validation-merror:0.43260\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.38871#011validation-merror:0.42860\u001b[0m\n", - "\u001b[34m[29]#011train-merror:0.38511#011validation-merror:0.42520\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.38356#011validation-merror:0.42420\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.37608#011validation-merror:0.41720\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.36973#011validation-merror:0.41240\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.36471#011validation-merror:0.40640\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.36271#011validation-merror:0.40500\u001b[0m\n", - 
"\u001b[34m[35]#011train-merror:0.35882#011validation-merror:0.40200\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.35791#011validation-merror:0.40060\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.35253#011validation-merror:0.39520\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.34796#011validation-merror:0.39300\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.34722#011validation-merror:0.39180\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.34482#011validation-merror:0.39040\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.34196#011validation-merror:0.38640\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.33910#011validation-merror:0.38160\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.33465#011validation-merror:0.37620\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.33179#011validation-merror:0.37440\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.32893#011validation-merror:0.37380\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.32533#011validation-merror:0.37220\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.32333#011validation-merror:0.37080\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.31842#011validation-merror:0.36740\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.31476#011validation-merror:0.36140\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.31145#011validation-merror:0.35800\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.30973#011validation-merror:0.35640\u001b[0m\n", - "\u001b[34m[52]#011train-merror:0.30642#011validation-merror:0.35560\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.30476#011validation-merror:0.35540\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.30299#011validation-merror:0.35300\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.30133#011validation-merror:0.35160\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.29945#011validation-merror:0.34920\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.29545#011validation-merror:0.34740\u001b[0m\n", - 
"\u001b[34m[58]#011train-merror:0.29185#011validation-merror:0.34100\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.29013#011validation-merror:0.33840\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.28670#011validation-merror:0.33640\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.28047#011validation-merror:0.33180\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.27927#011validation-merror:0.33040\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.27584#011validation-merror:0.32720\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.27264#011validation-merror:0.32340\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.26773#011validation-merror:0.31760\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.26459#011validation-merror:0.31400\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.26407#011validation-merror:0.31340\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.26219#011validation-merror:0.31160\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.25899#011validation-merror:0.30900\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.25727#011validation-merror:0.30800\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.25533#011validation-merror:0.30580\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.25350#011validation-merror:0.30440\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.25310#011validation-merror:0.30260\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.25001#011validation-merror:0.29880\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.24653#011validation-merror:0.29380\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.24350#011validation-merror:0.29340\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.24104#011validation-merror:0.29100\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.23824#011validation-merror:0.28880\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.23681#011validation-merror:0.28700\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.23367#011validation-merror:0.28320\u001b[0m\n", - 
"\u001b[34m[81]#011train-merror:0.23219#011validation-merror:0.28320\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.23104#011validation-merror:0.28120\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.22939#011validation-merror:0.27940\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.22830#011validation-merror:0.27800\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.22738#011validation-merror:0.27700\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.22647#011validation-merror:0.27640\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.22618#011validation-merror:0.27560\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.22464#011validation-merror:0.27580\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.22310#011validation-merror:0.27140\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.22093#011validation-merror:0.27080\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.22007#011validation-merror:0.26900\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.21796#011validation-merror:0.26780\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.21681#011validation-merror:0.26580\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.21590#011validation-merror:0.26480\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.21350#011validation-merror:0.26280\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.21001#011validation-merror:0.26200\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.20744#011validation-merror:0.25940\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.20601#011validation-merror:0.25780\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.20481#011validation-merror:0.25620\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + 
"\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Train matrix has 17499 rows and 12 columns\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Validation matrix has 5000 rows\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.099 ip-10-0-69-198.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.100 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.101 ip-10-0-69-198.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.00166#011validation-merror:0.00380\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.268 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.271 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.00126#011validation-merror:0.00320\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", + 
"\u001b[34m[3]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.00103#011validation-merror:0.00260\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[12]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.00046#011validation-merror:0.00180\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.00029#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.00040#011validation-merror:0.00160\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.00017#011validation-merror:0.00100\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + 
"\u001b[34m[26]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[35]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + 
"\u001b[34m[49]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[58]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + 
"\u001b[34m[72]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[81]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + 
"\u001b[34m[95]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", "\n", - "2024-02-11 13:28:18 Completed - Training job completed\n", - "Training seconds: 142\n", - "Billable seconds: 142\n" + "2024-02-11 16:35:53 Uploading - Uploading generated training model\n", + "2024-02-11 16:36:04 Completed - Training job completed\n", + "Training seconds: 127\n", + "Billable seconds: 127\n" ] } ], @@ -2754,17 +2577,17 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 37, "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-13-24-31-606/output/model.tar.gz'" + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz'" ] }, - "execution_count": 33, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -2775,7 +2598,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 38, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": [], @@ -2785,7 +2608,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 39, "id": "911457fa-812d-4991-a31c-4dfcb1593d3e", "metadata": {}, "outputs": [], @@ -2807,7 +2630,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 40, "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", "metadata": {}, "outputs": [ @@ -2858,10 +2681,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 61, + "execution_count": 40, "metadata": {}, "output_type": 
"execute_result" } @@ -2881,7 +2704,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 41, "id": "0ab0bcd2-5c70-4f48-bff9-d060f027e8e5", "metadata": {}, "outputs": [ @@ -2889,8 +2712,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-11 14:19:33,242 [info] model xgboost-model was loaded\n", - "> 2024-02-11 14:19:33,243 [info] Loaded ['xgboost-model']\n" + "> 2024-02-11 16:36:45,242 [info] model xgboost-model was loaded\n", + "> 2024-02-11 16:36:45,243 [info] Loaded ['xgboost-model']\n" ] } ], @@ -2900,7 +2723,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 43, "id": "dd57cfcd-5878-4775-83ee-422dc2261ce8", "metadata": {}, "outputs": [ @@ -2908,48 +2731,48 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'inputs': [[10.45, 4365439229004487, 4895135853273971, 10.45, 2024, 3, 20, 5, 57, 11, 5.225]]}\n" + "{'inputs': [[211.58623655913973, 4262047194499006, 4017367486513464, 204.26, 99995, 2024, 2, 11, 16, 23, 15, 7.326236559139744]]}\n" ] }, { "data": { "text/plain": [ - "{'id': '4674ea5dbf944924b1b17c6fba1fb6e9',\n", + "{'id': '9fca777838b34ecaa9a0978beb4c3324',\n", " 'model_name': 'xgboost-model',\n", - " 'outputs': [[0.0007725695031695068,\n", - " 0.18554285168647766,\n", - " 0.0005761922220699489,\n", - " 0.421852707862854,\n", - " 0.007665876764804125,\n", - " 0.012660829350352287,\n", - " 0.22086532413959503,\n", - " 0.026935890316963196,\n", - " 0.0004489392740651965,\n", - " 0.0073969378136098385,\n", - " 0.08978493511676788,\n", - " 0.0005328097031451762,\n", - " 0.020548472180962563,\n", - " 0.0011180019937455654,\n", - " 0.0005254872958175838,\n", - " 0.00045757496263831854,\n", - " 0.0003292290784884244,\n", - " 0.0009559299796819687,\n", - " 0.0010294184321537614]]}" + " 'outputs': [[0.0006098453304730356,\n", + " 0.000491024402435869,\n", + " 0.0005141795263625681,\n", + " 0.0007783450419083238,\n", + " 0.0007057395414449275,\n", + " 0.0006167895044200122,\n", + " 
0.0008293685968965292,\n", + " 0.0007642377750016749,\n", + " 0.0004749966901727021,\n", + " 0.0009146890370175242,\n", + " 0.0023798206821084023,\n", + " 0.0007584734121337533,\n", + " 0.0005588593194261193,\n", + " 0.0018726944690570235,\n", + " 0.001147600938566029,\n", + " 0.0010383735643699765,\n", + " 0.0010812224354594946,\n", + " 0.006694969721138477,\n", + " 0.9777687788009644]]}" ] }, - "execution_count": 64, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "response = server.test(body={'inputs':[[24995]]})\n", + "response = server.test(body={'inputs':[['Pension and insurances']]})\n", "response" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 44, "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", "metadata": {}, "outputs": [ @@ -2957,14 +2780,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-11 14:20:02,832 [info] Starting remote function deploy\n", - "2024-02-11 14:20:03 (info) Deploying function\n", - "2024-02-11 14:20:03 (info) Building\n", - "2024-02-11 14:20:03 (info) Staging files and preparing base images\n", - "2024-02-11 14:20:03 (info) Building processor image\n", - "2024-02-11 14:21:09 (info) Build complete\n", - "2024-02-11 14:21:17 (info) Function deploy complete\n", - "> 2024-02-11 14:21:24,362 [info] Successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-v3-admin-serving-v2.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" + "> 2024-02-11 16:39:18,386 [info] Starting remote function deploy\n", + "2024-02-11 16:39:18 (info) Deploying function\n", + "2024-02-11 16:39:18 (info) Building\n", + "2024-02-11 16:39:19 (info) Staging files and preparing base images\n", + "2024-02-11 16:39:19 (info) Building processor image\n", + "2024-02-11 16:40:24 (info) Build complete\n", + "2024-02-11 16:40:33 (info) Function 
deploy complete\n", + "> 2024-02-11 16:40:40,471 [info] Successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-v3-admin-serving-v2.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" ] }, { @@ -2973,7 +2796,7 @@ "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-v3-admin-serving-v2'})" ] }, - "execution_count": 65, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -2984,17 +2807,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c858e3e9-9e43-4148-8015-6047565db456", - "metadata": {}, - "outputs": [], - "source": [ - "#samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": 66, + "execution_count": 46, "id": "ac19dc03-01e2-4e29-ba75-a34804833d5c", "metadata": {}, "outputs": [], @@ -3004,7 +2817,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 47, "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", "metadata": {}, "outputs": [ @@ -3012,47 +2825,47 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-11 14:22:55,332 [info] Invoking function: {'method': 'POST', 'path': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//v2/models/xgboost-model/predict'}\n" + "> 2024-02-11 16:40:40,546 [info] Invoking function: {'method': 'POST', 'path': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//v2/models/xgboost-model/predict'}\n" ] } ], "source": [ - "response = serving_function_v2.invoke(path='/v2/models/xgboost-model/predict', body={\"inputs\": [[24995]]})" + "response = 
serving_function_v2.invoke(path='/v2/models/xgboost-model/predict', body={\"inputs\": [['Pension and insurances']]})" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 48, "id": "57eeaddc-654a-41d2-bb51-4a9a787a3311", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': '56b180c3-2e57-4bae-89cb-92b2c31f5c9b',\n", + "{'id': '641c5971-c881-4d56-a326-8b02900be8db',\n", " 'model_name': 'xgboost-model',\n", - " 'outputs': [[0.0007725695031695068,\n", - " 0.18554285168647766,\n", - " 0.0005761922220699489,\n", - " 0.421852707862854,\n", - " 0.007665876764804125,\n", - " 0.012660829350352287,\n", - " 0.22086532413959503,\n", - " 0.026935890316963196,\n", - " 0.0004489392740651965,\n", - " 0.0073969378136098385,\n", - " 0.08978493511676788,\n", - " 0.0005328097031451762,\n", - " 0.020548472180962563,\n", - " 0.0011180019937455654,\n", - " 0.0005254872958175838,\n", - " 0.00045757496263831854,\n", - " 0.0003292290784884244,\n", - " 0.0009559299796819687,\n", - " 0.0010294184321537614]]}" + " 'outputs': [[0.0006098453304730356,\n", + " 0.000491024402435869,\n", + " 0.0005141795263625681,\n", + " 0.0007783450419083238,\n", + " 0.0007057395414449275,\n", + " 0.0006167895044200122,\n", + " 0.0008293685968965292,\n", + " 0.0007642377750016749,\n", + " 0.0004749966901727021,\n", + " 0.0009146890370175242,\n", + " 0.0023798206821084023,\n", + " 0.0007584734121337533,\n", + " 0.0005588593194261193,\n", + " 0.0018726944690570235,\n", + " 0.001147600938566029,\n", + " 0.0010383735643699765,\n", + " 0.0010812224354594946,\n", + " 0.006694969721138477,\n", + " 0.9777687788009644]]}" ] }, - "execution_count": 71, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -3081,7 +2894,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 49, "id": "2e863ea7-5804-4637-b677-390c305cabfe", "metadata": {}, "outputs": [], @@ -3099,7 +2912,7 @@ }, { "cell_type": "code", - "execution_count": 78, + 
"execution_count": 50, "id": "ca4f7e49", "metadata": {}, "outputs": [], @@ -3117,7 +2930,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 51, "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", "metadata": {}, "outputs": [ @@ -3125,9 +2938,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-11 14:25:34,435 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '4e422c0c56c049469ee777c86b3bde01', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", - "> 2024-02-11 14:25:34,785 [info] Job is running in the background, pod: evaluate-evaluate-ppcb5\n", - "[14:25:39] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", + "> 2024-02-11 16:40:40,769 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '77fb208c0816491e9f7ae69634b31b1b', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", + "> 2024-02-11 16:40:41,204 [info] Job is running in the background, pod: evaluate-evaluate-x5mzk\n", + "[16:40:45] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", "configuration generated by an older version of XGBoost, please export the model by calling\n", "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", "\n", @@ -3135,9 +2948,9 @@ "\n", "for more details about differences between saving model and serializing.\n", "\n", - "> 2024-02-11 14:25:39,434 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 4e422c0c56c049469ee777c86b3bde01 -p sagemaker-v3-admin', 'logs_cmd': 'mlrun logs 4e422c0c56c049469ee777c86b3bde01 -p sagemaker-v3-admin'}\n", - "> 2024-02-11 14:25:39,434 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-v3-admin/jobs/monitor/4e422c0c56c049469ee777c86b3bde01/overview'}\n", - "> 2024-02-11 14:25:39,435 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + "> 2024-02-11 16:40:46,110 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin', 'logs_cmd': 'mlrun logs 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin'}\n", + "> 2024-02-11 16:40:46,110 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-v3-admin/jobs/monitor/77fb208c0816491e9f7ae69634b31b1b/overview'}\n", + "> 2024-02-11 16:40:46,110 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" ] }, { @@ -3311,26 +3124,26 @@ " \n", " \n", " sagemaker-v3-admin\n", - " \n", + " \n", " 0\n", - " Feb 11 14:25:37\n", + " Feb 11 16:40:44\n", " completed\n", " evaluate-evaluate\n", - "
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc26
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-ppcb5
\n", + "
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc26
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-x5mzk
\n", " \n", - "
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-13-24-31-606/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category_mapped
factorize_key={'Uncategorized': 0, 'Entertainment': 1, 'Education': 2, 'Shopping': 3, 'Personal Care': 4, 'Health and Fitness': 5, 'Food and Dining': 6, 'Gifts and Donations': 7, 'Investments': 8, 'Bills and Utilities': 9, 'Auto and Transport': 10, 'Travel': 11, 'Fees and Charges': 12, 'Business Services': 13, 'Personal Services': 14, 'Taxes': 15, 'Gambling': 16, 'Home': 17, 'Pension and insurances': 18}
\n", + "
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category_mapped
factorize_key={'Uncategorized': 0, 'Entertainment': 1, 'Education': 2, 'Shopping': 3, 'Personal Care': 4, 'Health and Fitness': 5, 'Food and Dining': 6, 'Gifts and Donations': 7, 'Investments': 8, 'Bills and Utilities': 9, 'Auto and Transport': 10, 'Travel': 11, 'Fees and Charges': 12, 'Business Services': 13, 'Personal Services': 14, 'Taxes': 15, 'Gambling': 16, 'Home': 17, 'Pension and insurances': 18}
\n", " \n", "
classification_report
\n", " \n", " \n", "\n", "\n", - "
\n", + "
\n", "
\n", - " Title\n", - " ×\n", + " Title\n", + " ×\n", "
\n", - " \n", + " \n", "
\n", "
\n" ], @@ -3351,7 +3164,7 @@ { "data": { "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" + " > to track results use the .show() or .logs() methods or click here to open in UI" ], "text/plain": [ "" @@ -3364,7 +3177,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-11 14:25:45,077 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + "> 2024-02-11 16:40:54,323 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" ] } ], @@ -3391,7 +3204,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 52, "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", "metadata": {}, "outputs": [ @@ -3425,156 +3238,156 @@ " \n", " \n", " Uncategorized\n", - " 0.833333\n", - " 0.625000\n", - " 0.714286\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 8.0000\n", " \n", " \n", " Entertainment\n", - " 0.680995\n", - " 0.831492\n", - " 0.748756\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 362.0000\n", " \n", " \n", " Education\n", - " 0.923077\n", - " 0.857143\n", - " 0.888889\n", + " 1.000000\n", + " 0.928571\n", + " 0.962963\n", " 14.0000\n", " \n", " \n", " Shopping\n", - " 0.710200\n", - " 0.866279\n", - " 0.780513\n", + " 0.998839\n", + " 1.000000\n", + " 0.999419\n", " 860.0000\n", " \n", " \n", " Personal Care\n", - " 0.880000\n", - " 0.956522\n", - " 0.916667\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 23.0000\n", " \n", " \n", " Health and Fitness\n", - " 0.875000\n", - " 0.569106\n", - " 0.689655\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 123.0000\n", " \n", " \n", " Food and Dining\n", - " 0.845161\n", - " 0.603687\n", - " 0.704301\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 217.0000\n", " \n", " \n", " Gifts and Donations\n", - " 0.738095\n", - " 0.455882\n", - " 0.563636\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 68.0000\n", " \n", " \n", " 
Investments\n", + " 0.916667\n", " 1.000000\n", - " 0.909091\n", - " 0.952381\n", + " 0.956522\n", " 22.0000\n", " \n", " \n", " Bills and Utilities\n", - " 0.935484\n", - " 0.630435\n", - " 0.753247\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 92.0000\n", " \n", " \n", " Auto and Transport\n", - " 0.758373\n", - " 0.667368\n", - " 0.709966\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 475.0000\n", " \n", " \n", " Travel\n", - " 0.833333\n", - " 0.694444\n", - " 0.757576\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 36.0000\n", " \n", " \n", " Fees and Charges\n", " 1.000000\n", - " 0.827586\n", - " 0.905660\n", + " 1.000000\n", + " 1.000000\n", " 29.0000\n", " \n", " \n", " Business Services\n", - " 0.969697\n", - " 0.680851\n", - " 0.800000\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 47.0000\n", " \n", " \n", " Personal Services\n", - " 0.894737\n", - " 0.708333\n", - " 0.790698\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 24.0000\n", " \n", " \n", " Taxes\n", - " 0.900000\n", - " 0.692308\n", - " 0.782609\n", + " 1.000000\n", + " 0.846154\n", + " 0.916667\n", " 13.0000\n", " \n", " \n", " Gambling\n", " 1.000000\n", - " 0.750000\n", - " 0.857143\n", + " 1.000000\n", + " 1.000000\n", " 8.0000\n", " \n", " \n", " Home\n", - " 0.886364\n", - " 0.750000\n", - " 0.812500\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 52.0000\n", " \n", " \n", " Pension and insurances\n", - " 0.863636\n", - " 0.703704\n", - " 0.775510\n", + " 1.000000\n", + " 1.000000\n", + " 1.000000\n", " 27.0000\n", " \n", " \n", " accuracy\n", - " 0.753200\n", - " 0.753200\n", - " 0.753200\n", - " 0.7532\n", + " 0.998800\n", + " 0.998800\n", + " 0.998800\n", + " 0.9988\n", " \n", " \n", " macro avg\n", - " 0.869868\n", - " 0.725223\n", - " 0.784421\n", + " 0.995553\n", + " 0.988143\n", + " 0.991346\n", " 2500.0000\n", " \n", " \n", " weighted avg\n", - " 0.768715\n", - " 0.753200\n", - " 0.750136\n", + " 0.998867\n", + " 
0.998800\n", + " 0.998777\n", " 2500.0000\n", " \n", " \n", @@ -3583,31 +3396,31 @@ ], "text/plain": [ " precision recall f1-score support\n", - "Uncategorized 0.833333 0.625000 0.714286 8.0000\n", - "Entertainment 0.680995 0.831492 0.748756 362.0000\n", - "Education 0.923077 0.857143 0.888889 14.0000\n", - "Shopping 0.710200 0.866279 0.780513 860.0000\n", - "Personal Care 0.880000 0.956522 0.916667 23.0000\n", - "Health and Fitness 0.875000 0.569106 0.689655 123.0000\n", - "Food and Dining 0.845161 0.603687 0.704301 217.0000\n", - "Gifts and Donations 0.738095 0.455882 0.563636 68.0000\n", - "Investments 1.000000 0.909091 0.952381 22.0000\n", - "Bills and Utilities 0.935484 0.630435 0.753247 92.0000\n", - "Auto and Transport 0.758373 0.667368 0.709966 475.0000\n", - "Travel 0.833333 0.694444 0.757576 36.0000\n", - "Fees and Charges 1.000000 0.827586 0.905660 29.0000\n", - "Business Services 0.969697 0.680851 0.800000 47.0000\n", - "Personal Services 0.894737 0.708333 0.790698 24.0000\n", - "Taxes 0.900000 0.692308 0.782609 13.0000\n", - "Gambling 1.000000 0.750000 0.857143 8.0000\n", - "Home 0.886364 0.750000 0.812500 52.0000\n", - "Pension and insurances 0.863636 0.703704 0.775510 27.0000\n", - "accuracy 0.753200 0.753200 0.753200 0.7532\n", - "macro avg 0.869868 0.725223 0.784421 2500.0000\n", - "weighted avg 0.768715 0.753200 0.750136 2500.0000" + "Uncategorized 1.000000 1.000000 1.000000 8.0000\n", + "Entertainment 1.000000 1.000000 1.000000 362.0000\n", + "Education 1.000000 0.928571 0.962963 14.0000\n", + "Shopping 0.998839 1.000000 0.999419 860.0000\n", + "Personal Care 1.000000 1.000000 1.000000 23.0000\n", + "Health and Fitness 1.000000 1.000000 1.000000 123.0000\n", + "Food and Dining 1.000000 1.000000 1.000000 217.0000\n", + "Gifts and Donations 1.000000 1.000000 1.000000 68.0000\n", + "Investments 0.916667 1.000000 0.956522 22.0000\n", + "Bills and Utilities 1.000000 1.000000 1.000000 92.0000\n", + "Auto and Transport 1.000000 1.000000 1.000000 
475.0000\n", + "Travel 1.000000 1.000000 1.000000 36.0000\n", + "Fees and Charges 1.000000 1.000000 1.000000 29.0000\n", + "Business Services 1.000000 1.000000 1.000000 47.0000\n", + "Personal Services 1.000000 1.000000 1.000000 24.0000\n", + "Taxes 1.000000 0.846154 0.916667 13.0000\n", + "Gambling 1.000000 1.000000 1.000000 8.0000\n", + "Home 1.000000 1.000000 1.000000 52.0000\n", + "Pension and insurances 1.000000 1.000000 1.000000 27.0000\n", + "accuracy 0.998800 0.998800 0.998800 0.9988\n", + "macro avg 0.995553 0.988143 0.991346 2500.0000\n", + "weighted avg 0.998867 0.998800 0.998777 2500.0000" ] }, - "execution_count": 80, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } From 4c1acb1c17ff595b83744ff04eba027fdfcb0230 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 12 Feb 2024 10:07:16 +0000 Subject: [PATCH 11/16] implementation with iguazio feature store --- ...ncial_payment_classification_with_fs.ipynb | 2911 +++++++++++++++++ 1 file changed, 2911 insertions(+) create mode 100644 financial_payment_classification_with_fs.ipynb diff --git a/financial_payment_classification_with_fs.ipynb b/financial_payment_classification_with_fs.ipynb new file mode 100644 index 0000000..7a5e1d6 --- /dev/null +++ b/financial_payment_classification_with_fs.ipynb @@ -0,0 +1,2911 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01b5c703", + "metadata": {}, + "source": [ + "# SageMaker Payment Classification \n" + ] + }, + { + "cell_type": "markdown", + "id": "6498f087", + "metadata": {}, + "source": [ + "---\n", + "\n", + "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", + "\n", + "![This us-west-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "c2e49281", + "metadata": {}, + "source": [ + "\n", + "## Background \n", + "\n", + "This notebook demonstrates how you can train and deploy a machine learning model to classify payment transactions. Enriching financial transactions with the category of the transaction. This can be used as an intermediate step in fraud detection, personalization or anomaly detection. As well as a method to provide end users (e.g. customers at a bank) with an overview of their spending habits. Amazon SageMaker can be used to train and deploy a XGBoost model, as well as the required underlying infrastructure. For this notebook a generated dataset is used where a payment consists of mostly an amount, sender, receiver and timestamp.\n", + "\n", + "\n", + "## Notebook overview \n", + "\n", + "This notebook consists of seven parts. First, we import and configure the required libraries. After that we prepare the data used in this example and create the feature store. With the newly created features we create a XGBoost model. An endpoint is created to host this model. We evaluate the performance of the model and end by cleaning up the used resources.\n", + "\n", + "## Dataset \n", + "\n", + "For this notebook we use a synthetic dataset. 
This dataset has the following features \n", + "\n", + "* __transaction_category__: The category of the transaction, this is one of the next 19 options.\n", + "\n", + " 'Uncategorized', 'Entertainment', 'Education',\n", + " 'Shopping', 'Personal Care', 'Health and Fitness',\n", + " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", + " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", + " 'Fees and Charges', 'Business Services', 'Personal Services',\n", + " 'Taxes', 'Gambling', 'Home',\n", + " 'Pension and insurances'\n", + "\n", + "\n", + "* __receiver_id__: an identifier for the receiving party. The identifier consist of 16 numbers.\n", + "* __sender_id__: an identifier for the sending party. The identifier consist of 16 numbers.\n", + "* __amount__: the amount which is transferred.\n", + "* __timestamp__: the timestamp of the transaction in YYYY-MM-DD HH:MM:SS format.\n", + "\n", + "\n", + "### 1. Setup \n", + "\n", + "Before we start we need to update the sagemaker library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fff19d6b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# import sys\n", + "# !{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", + "# !{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", + "# !{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": 
[ + "> 2024-02-11 16:21:51,304 [info] Identified pre-initialized git repo, using it: {'url': 'git://github.com/aviaIguazio/demo-sagemaker.git#refs/heads/development'}\n", + "> 2024-02-11 16:22:06,708 [info] Created and saved project: {'name': 'sagemaker-v3-admin', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}\n", + "> 2024-02-11 16:22:07,592 [info] Project created successfully: {'project_name': 'sagemaker-v3', 'stored_in_db': True}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"sagemaker-v3\", \n", + " user_project=True,\n", + " parameters={\n", + " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", + " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1b17a94d", + "metadata": {}, + "source": [ + "Now that we have the latest version we can import the libraries that we'll use in this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "42c5d6d0", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import io\n", + "import sagemaker\n", + "import time\n", + "import os\n", + "from sklearn.metrics import classification_report\n", + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import datetime, timedelta" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", + "metadata": {}, + "outputs": [], + "source": [ + "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", + "metadata": {}, + "outputs": [], + "source": [ + "sess = sagemaker.Session()\n", + "write_bucket = sess.default_bucket()\n", + "write_prefix = \"sagemaker-app-lab\"" + ] + }, + { + "cell_type": "markdown", + "id": "3af7c33d", + "metadata": {}, + "source": [ + "Let's set the session variables to ensure that SageMaker is configured 
correctly." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c0e4db17", + "metadata": {}, + "outputs": [], + "source": [ + "region = sagemaker.Session().boto_region_name\n", + "sm_client = boto3.client(\"sagemaker\")\n", + "boto_session = boto3.Session(region_name=region)\n", + "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", + "role = sagemaker_role\n", + "bucket_prefix = \"payment-classification\"\n", + "s3_bucket = sagemaker_session.default_bucket()" + ] + }, + { + "cell_type": "markdown", + "id": "4fe6a975", + "metadata": {}, + "source": [ + "We define the factorize key which is used to map the '__transaction_category__' to numeric values" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "43946b9f", + "metadata": {}, + "outputs": [], + "source": [ + "factorize_key = {\n", + " \"Uncategorized\": 0,\n", + " \"Entertainment\": 1,\n", + " \"Education\": 2,\n", + " \"Shopping\": 3,\n", + " \"Personal Care\": 4,\n", + " \"Health and Fitness\": 5,\n", + " \"Food and Dining\": 6,\n", + " \"Gifts and Donations\": 7,\n", + " \"Investments\": 8,\n", + " \"Bills and Utilities\": 9,\n", + " \"Auto and Transport\": 10,\n", + " \"Travel\": 11,\n", + " \"Fees and Charges\": 12,\n", + " \"Business Services\": 13,\n", + " \"Personal Services\": 14,\n", + " \"Taxes\": 15,\n", + " \"Gambling\": 16,\n", + " \"Home\": 17,\n", + " \"Pension and insurances\": 18,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5e3dc3c4", + "metadata": {}, + "source": [ + "### 2. 
Data preparation \n", + "\n", + "We ingest the simulated data from the public SageMaker S3 training database:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5ff0d280", + "metadata": {}, + "outputs": [], + "source": [ + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " f\"sagemaker-example-files-prod-{region}\",\n", + " \"datasets/tabular/synthetic_financial/financial_transactions_mini.csv\",\n", + " \"financial_transactions_mini.csv\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "08578d93", + "metadata": {}, + "source": [ + "Let's start by loading the dataset from our csv file into a Pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a477abd7", + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\n", + " \"financial_transactions_mini.csv\",\n", + " parse_dates=[\"timestamp\"],\n", + " infer_datetime_format=True,\n", + " dtype={\"transaction_category\": \"string\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cf6be447", + "metadata": {}, + "source": [ + "The dataframe looks as follows:\n", + "\n", + "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", + "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", + "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", + "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", + "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", + "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", + "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", + "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", + "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 
13:23:44 |\n", + "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", + "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", + "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8c15f00d-8f89-41ec-aa22-f23fc394d1b4", + "metadata": {}, + "outputs": [], + "source": [ + "from utils import update_timestamps\n", + "data=update_timestamps(data)" + ] + }, + { + "cell_type": "markdown", + "id": "b5492919", + "metadata": {}, + "source": [ + "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", + "metadata": {}, + "outputs": [], + "source": [ + "data['transaction_id']= data.reset_index().index " + ] + }, + { + "cell_type": "markdown", + "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", + "metadata": { + "tags": [] + }, + "source": [ + "### 3. Create feature store \n", + "\n", + "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 
" + ] + }, + { + "cell_type": "markdown", + "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", + "metadata": {}, + "source": [ + "#### feature-group-payment-classification" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", + "metadata": {}, + "outputs": [], + "source": [ + "# move category to the first column to match sagemaker label train convention\"\n", + "def pop_and_move_to_start(d, key):\n", + " # Pop the item if it exists, otherwise return None\n", + " value = d.pop(key, None)\n", + " if value is not None:\n", + " # Move the popped item to the start\n", + " d = {key: value, **d}\n", + " return d\n", + "\n", + "def calculate_category_distance(event): \n", + " column_name ='transaction_category_mapped'\n", + " event = pop_and_move_to_start(event,column_name)\n", + " category = event[column_name]\n", + " event['distance'] = abs(event['amount']-event['amount_avg_1d']) \n", + " return event" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4101c303-2da3-431b-9375-9fa1747070af", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "DateExtractor\n", + "\n", + "DateExtractor\n", + "\n", + "\n", + "\n", + "_start->DateExtractor\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "MapValues\n", + "\n", + "MapValues\n", + "\n", + "\n", + "\n", + "DateExtractor->MapValues\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Aggregates\n", + "\n", + "Aggregates\n", + "\n", + "\n", + "\n", + "MapValues->Aggregates\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "calculate_category_distance\n", + "\n", + "calculate_category_distance\n", + "\n", + "\n", + "\n", + "Aggregates->calculate_category_distance\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "DropFeatures\n", + "\n", + "DropFeatures\n", + "\n", + "\n", + "\n", + 
"calculate_category_distance->DropFeatures\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "parquet/parquet\n", + "\n", + "\n", + "parquet\n", + "\n", + "\n", + "\n", + "DropFeatures->parquet/parquet\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "nosql/nosql\n", + "\n", + "\n", + "nosql\n", + "\n", + "\n", + "\n", + "DropFeatures->nosql/nosql\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fstore\n", + "from mlrun.feature_store.steps import OneHotEncoder, MapValues, DateExtractor, DropFeatures\n", + "\n", + "# creating feature set\n", + "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", + " entities=[fstore.Entity(\"transaction_category\")],\n", + " description=\"transactions feature set\")\n", + "# setting up the graph\n", + "extended_transactions_set.graph \\\n", + " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp')) \\\n", + " .to(MapValues({'transaction_category' : factorize_key}, with_original_features=True)) \\\n", + "\n", + "extended_transactions_set.add_aggregation(name='amount',column='amount',operations=['avg'],windows=['1d'],period='1h')\n", + "\n", + "extended_transactions_set.graph \\\n", + " .to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates') \\\n", + " .to(DropFeatures(features=['timestamp']))\n", + "\n", + "\n", + "extended_transactions_set.set_targets()\n", + "\n", + "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53eb2151-447a-4eb0-be7f-a07f1cbea32d", + "metadata": {}, + "outputs": [], + "source": [ + "# Keeping every second row\n", + "df_kept = data.iloc[::2]\n", + "\n", + "# Or, to explicitly remove every second row (the opposite selection)\n", + "df_removed = 
data.drop(data.index[::2])\n", + "\n", + "\n", + "# Keeping every second row\n", + "df_kept = df_removed.iloc[::2]\n", + "\n", + "# Or, to explicitly remove every second row (the opposite selection)\n", + "df_removed_v2 = df_removed.drop(df_removed.index[::2])\n", + "\n", + "df_removed_v2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06c03ea5-8394-44ff-b81d-755e1c244269", + "metadata": {}, + "outputs": [], + "source": [ + "from utils import update_timestamps\n", + "df_removed_v2=update_timestamps(df_removed_v2)\n", + "df_removed_v2" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_category
Uncategorized0879.78000045185519044999194457298962882528879.783202445912270.000000
Uncategorized0840.50000047579519156690804655296518888015801.22720242611444939.280000
Uncategorized0701.43666745185519044999194910949333064003423.3111202431551621278.126667
Uncategorized0621.76000045185519044999194415760195692405382.7315202411712851239.030000
Uncategorized0519.76200040980889806929744412940106031926111.771920241385057407.992000
..........................................
Pension and insurances18211.45363641796068600888494359198069543354302.10999792024380465790.646364
Pension and insurances18211.10739147515386207333054021524999937895115.89999832024431104295.217391
Pension and insurances18211.09285244050083552203244165276502284291207.089998720242101834304.012852
Pension and insurances18211.61259040921157888775434328901131757235355.5899991202441811537143.967410
Pension and insurances18211.58623742620471944990064017367486513464204.269999520242111623157.326237
\n", + "

24999 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category_mapped amount_avg_1d \\\n", + "transaction_category \n", + "Uncategorized 0 879.780000 \n", + "Uncategorized 0 840.500000 \n", + "Uncategorized 0 701.436667 \n", + "Uncategorized 0 621.760000 \n", + "Uncategorized 0 519.762000 \n", + "... ... ... \n", + "Pension and insurances 18 211.453636 \n", + "Pension and insurances 18 211.107391 \n", + "Pension and insurances 18 211.092852 \n", + "Pension and insurances 18 211.612590 \n", + "Pension and insurances 18 211.586237 \n", + "\n", + " receiver_id sender_id amount \\\n", + "transaction_category \n", + "Uncategorized 4518551904499919 4457298962882528 879.78 \n", + "Uncategorized 4757951915669080 4655296518888015 801.22 \n", + "Uncategorized 4518551904499919 4910949333064003 423.31 \n", + "Uncategorized 4518551904499919 4415760195692405 382.73 \n", + "Uncategorized 4098088980692974 4412940106031926 111.77 \n", + "... ... ... ... \n", + "Pension and insurances 4179606860088849 4359198069543354 302.10 \n", + "Pension and insurances 4751538620733305 4021524999937895 115.89 \n", + "Pension and insurances 4405008355220324 4165276502284291 207.08 \n", + "Pension and insurances 4092115788877543 4328901131757235 355.58 \n", + "Pension and insurances 4262047194499006 4017367486513464 204.26 \n", + "\n", + " transaction_id timestamp_year timestamp_month \\\n", + "transaction_category \n", + "Uncategorized 3 2024 4 \n", + "Uncategorized 7 2024 2 \n", + "Uncategorized 11 2024 3 \n", + "Uncategorized 15 2024 1 \n", + "Uncategorized 19 2024 1 \n", + "... ... ... ... 
\n", + "Pension and insurances 99979 2024 3 \n", + "Pension and insurances 99983 2024 4 \n", + "Pension and insurances 99987 2024 2 \n", + "Pension and insurances 99991 2024 4 \n", + "Pension and insurances 99995 2024 2 \n", + "\n", + " timestamp_day timestamp_hour timestamp_minute \\\n", + "transaction_category \n", + "Uncategorized 5 9 12 \n", + "Uncategorized 6 11 44 \n", + "Uncategorized 15 5 16 \n", + "Uncategorized 17 12 8 \n", + "Uncategorized 3 8 50 \n", + "... ... ... ... \n", + "Pension and insurances 8 0 46 \n", + "Pension and insurances 3 11 0 \n", + "Pension and insurances 10 18 34 \n", + "Pension and insurances 18 11 53 \n", + "Pension and insurances 11 16 23 \n", + "\n", + " timestamp_second distance \n", + "transaction_category \n", + "Uncategorized 27 0.000000 \n", + "Uncategorized 49 39.280000 \n", + "Uncategorized 21 278.126667 \n", + "Uncategorized 51 239.030000 \n", + "Uncategorized 57 407.992000 \n", + "... ... ... \n", + "Pension and insurances 57 90.646364 \n", + "Pension and insurances 42 95.217391 \n", + "Pension and insurances 30 4.012852 \n", + "Pension and insurances 7 143.967410 \n", + "Pension and insurances 15 7.326237 \n", + "\n", + "[24999 rows x 13 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ingested_data = extended_transactions_set.ingest(df_removed_v2, overwrite=True)\n", + "ingested_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6595564d-91a2-49c0-93e1-dc8ebb28467d", + "metadata": {}, + "outputs": [], + "source": [ + "#data = ingested_data.reset_index(drop=True)\n", + "data = ingested_data\n", + "#data = data[['transaction_category'] + [col for col in data.columns if col != 'transaction_category']]\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", + "metadata": {}, + "outputs": [], + "source": [ + "# Import MLRun's Feature Store\n", + "import 
mlrun.feature_store as fstore\n", + "\n", + "data_cols = list(data.columns)\n", + "# create feature vector on top of aggregations\n", + "# Define the list of features we will be using\n", + "features = [f\"transactions.{name}\" for name in data_cols[1:]] \n", + "\n", + "\n", + "# Define the feature vector name for future reference\n", + "fv_name = 'transactions-vector'\n", + "\n", + "# Define the feature vector using our Feature Store (fstore)\n", + "transactions_fv = fstore.FeatureVector(fv_name, \n", + " features,\n", + " label_feature='transactions.transaction_category_mapped',\n", + " description='stocks information')\n", + "\n", + "#label_feature = 'transactions-v2.transaction_category',\n", + "# Save the feature vector in the Feature Store\n", + "transactions_fv.save()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "eb69d9fa-22a9-4b9f-9443-d00d9190ad55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistancetransaction_category_mapped
0879.78000045185519044999194457298962882528879.783202445912270.0000000
1840.50000047579519156690804655296518888015801.22720242611444939.2800000
2701.43666745185519044999194910949333064003423.3111202431551621278.1266670
3621.76000045185519044999194415760195692405382.7315202411712851239.0300000
4519.76200040980889806929744412940106031926111.771920241385057407.9920000
..........................................
24994211.45363641796068600888494359198069543354302.10999792024380465790.64636418
24995211.10739147515386207333054021524999937895115.89999832024431104295.21739118
24996211.09285244050083552203244165276502284291207.089998720242101834304.01285218
24997211.61259040921157888775434328901131757235355.5899991202441811537143.96741018
24998211.58623742620471944990064017367486513464204.269999520242111623157.32623718
\n", + "

24999 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " amount_avg_1d receiver_id sender_id amount \\\n", + "0 879.780000 4518551904499919 4457298962882528 879.78 \n", + "1 840.500000 4757951915669080 4655296518888015 801.22 \n", + "2 701.436667 4518551904499919 4910949333064003 423.31 \n", + "3 621.760000 4518551904499919 4415760195692405 382.73 \n", + "4 519.762000 4098088980692974 4412940106031926 111.77 \n", + "... ... ... ... ... \n", + "24994 211.453636 4179606860088849 4359198069543354 302.10 \n", + "24995 211.107391 4751538620733305 4021524999937895 115.89 \n", + "24996 211.092852 4405008355220324 4165276502284291 207.08 \n", + "24997 211.612590 4092115788877543 4328901131757235 355.58 \n", + "24998 211.586237 4262047194499006 4017367486513464 204.26 \n", + "\n", + " transaction_id timestamp_year timestamp_month timestamp_day \\\n", + "0 3 2024 4 5 \n", + "1 7 2024 2 6 \n", + "2 11 2024 3 15 \n", + "3 15 2024 1 17 \n", + "4 19 2024 1 3 \n", + "... ... ... ... ... \n", + "24994 99979 2024 3 8 \n", + "24995 99983 2024 4 3 \n", + "24996 99987 2024 2 10 \n", + "24997 99991 2024 4 18 \n", + "24998 99995 2024 2 11 \n", + "\n", + " timestamp_hour timestamp_minute timestamp_second distance \\\n", + "0 9 12 27 0.000000 \n", + "1 11 44 49 39.280000 \n", + "2 5 16 21 278.126667 \n", + "3 12 8 51 239.030000 \n", + "4 8 50 57 407.992000 \n", + "... ... ... ... ... \n", + "24994 0 46 57 90.646364 \n", + "24995 11 0 42 95.217391 \n", + "24996 18 34 30 4.012852 \n", + "24997 11 53 7 143.967410 \n", + "24998 16 23 15 7.326237 \n", + "\n", + " transaction_category_mapped \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... 
\n", + "24994 18 \n", + "24995 18 \n", + "24996 18 \n", + "24997 18 \n", + "24998 18 \n", + "\n", + "[24999 rows x 13 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fs\n", + "resp = transactions_fv.get_offline_features()\n", + "#Preview the dataset\n", + "fv_data = resp.to_dataframe()\n", + "fv_data" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "cb156ebe-9846-4ff3-a388-92362df7c741", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'amount_avg_1d': 211.58623655913973,\n", + " 'receiver_id': 4262047194499006,\n", + " 'sender_id': 4017367486513464,\n", + " 'amount': 204.26,\n", + " 'transaction_id': 99995,\n", + " 'timestamp_year': 2024,\n", + " 'timestamp_month': 2,\n", + " 'timestamp_day': 11,\n", + " 'timestamp_hour': 16,\n", + " 'timestamp_minute': 23,\n", + " 'timestamp_second': 15,\n", + " 'distance': 7.326236559139744}]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svc = transactions_fv.get_online_feature_service()\n", + "resp = svc.get([{\"transaction_category\": \"Pension and insurances\"}])\n", + "resp" + ] + }, + { + "cell_type": "markdown", + "id": "b5e4834e", + "metadata": {}, + "source": [ + "We update the values in the feature store with the real values of our data" + ] + }, + { + "cell_type": "markdown", + "id": "e2f6395f", + "metadata": {}, + "source": [ + "And display them after getting them from the feature store" + ] + }, + { + "cell_type": "markdown", + "id": "cf148985", + "metadata": {}, + "source": [ + "We use the feature store to calculate the distance between the average of every category and the current amount" + ] + }, + { + "cell_type": "markdown", + "id": "289eeca6", + "metadata": {}, + "source": [ + "### 4. 
Create model \n", + "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to its relatively fast training time and explainable properties. The model can be substituted at will with a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", + "\n", + "\n", + "\n", + "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "1cbb00b5-46bf-4a20-aad9-a03716ab97ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
00879.78000045185519044999194457298962882528879.783202445912270.000000
10840.50000047579519156690804655296518888015801.22720242611444939.280000
20701.43666745185519044999194910949333064003423.3111202431551621278.126667
30621.76000045185519044999194415760195692405382.7315202411712851239.030000
40519.76200040980889806929744412940106031926111.771920241385057407.992000
..........................................
2499418211.45363641796068600888494359198069543354302.10999792024380465790.646364
2499518211.10739147515386207333054021524999937895115.89999832024431104295.217391
2499618211.09285244050083552203244165276502284291207.089998720242101834304.012852
2499718211.61259040921157888775434328901131757235355.5899991202441811537143.967410
2499818211.58623742620471944990064017367486513464204.269999520242111623157.326237
\n", + "

24999 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category_mapped amount_avg_1d receiver_id \\\n", + "0 0 879.780000 4518551904499919 \n", + "1 0 840.500000 4757951915669080 \n", + "2 0 701.436667 4518551904499919 \n", + "3 0 621.760000 4518551904499919 \n", + "4 0 519.762000 4098088980692974 \n", + "... ... ... ... \n", + "24994 18 211.453636 4179606860088849 \n", + "24995 18 211.107391 4751538620733305 \n", + "24996 18 211.092852 4405008355220324 \n", + "24997 18 211.612590 4092115788877543 \n", + "24998 18 211.586237 4262047194499006 \n", + "\n", + " sender_id amount transaction_id timestamp_year \\\n", + "0 4457298962882528 879.78 3 2024 \n", + "1 4655296518888015 801.22 7 2024 \n", + "2 4910949333064003 423.31 11 2024 \n", + "3 4415760195692405 382.73 15 2024 \n", + "4 4412940106031926 111.77 19 2024 \n", + "... ... ... ... ... \n", + "24994 4359198069543354 302.10 99979 2024 \n", + "24995 4021524999937895 115.89 99983 2024 \n", + "24996 4165276502284291 207.08 99987 2024 \n", + "24997 4328901131757235 355.58 99991 2024 \n", + "24998 4017367486513464 204.26 99995 2024 \n", + "\n", + " timestamp_month timestamp_day timestamp_hour timestamp_minute \\\n", + "0 4 5 9 12 \n", + "1 2 6 11 44 \n", + "2 3 15 5 16 \n", + "3 1 17 12 8 \n", + "4 1 3 8 50 \n", + "... ... ... ... ... \n", + "24994 3 8 0 46 \n", + "24995 4 3 11 0 \n", + "24996 2 10 18 34 \n", + "24997 4 18 11 53 \n", + "24998 2 11 16 23 \n", + "\n", + " timestamp_second distance \n", + "0 27 0.000000 \n", + "1 49 39.280000 \n", + "2 21 278.126667 \n", + "3 51 239.030000 \n", + "4 57 407.992000 \n", + "... ... ... 
\n", + "24994 57 90.646364 \n", + "24995 42 95.217391 \n", + "24996 30 4.012852 \n", + "24997 7 143.967410 \n", + "24998 15 7.326237 \n", + "\n", + "[24999 rows x 13 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fs\n", + "resp = transactions_fv.get_offline_features()\n", + "#Preview the dataset\n", + "fv_data = resp.to_dataframe()\n", + "\n", + "column_to_move = 'transaction_category_mapped'\n", + "\n", + "new_columns_order = [column_to_move] + [col for col in fv_data.columns if col != column_to_move]\n", + "fv_data = fv_data[new_columns_order]\n", + "\n", + "\n", + "data = fv_data\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "47512de3-60ac-49c7-ace8-031959527e86", + "metadata": {}, + "outputs": [], + "source": [ + "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", + "train_data, validation_data, test_data = np.split(\n", + " fv_data.sample(frac=1, random_state=42), [int(0.7 * len(fv_data)), int(0.9 * len(fv_data))]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f81f65b9", + "metadata": {}, + "source": [ + "We save these sets to a file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "f849a7a9", + "metadata": {}, + "outputs": [], + "source": [ + "train_data.to_csv(\"train.csv\", index=False, header=False)\n", + "validation_data.to_csv(\"validation.csv\", index=False, header=False)\n", + "test_data.to_csv(\"test.csv\", index=False, header=True)" + ] + }, + { + "cell_type": "markdown", + "id": "de669936", + "metadata": {}, + "source": [ + "And upload these files to our s3 bucket" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "e1ca2543", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"train/train.csv\")\n", + ").upload_file(\"train.csv\")\n", + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"validation/validation.csv\")\n", + ").upload_file(\"validation.csv\")\n", + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"test/test.csv\")\n", + ").upload_file(\"test.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "22de532f", + "metadata": {}, + "source": [ + "Get the XGBoost sagemaker image" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "a41b6a7d", + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" + ] + }, + { + "cell_type": "markdown", + "id": "66cae2a9", + "metadata": {}, + "source": [ + "Transform our data to a sagemaker input for training" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e51c917a", + "metadata": {}, + "outputs": [], + "source": [ + "s3_input_train = sagemaker.inputs.TrainingInput(\n", + " s3_data=\"s3://{}/{}/train\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", + ")\n", + "s3_input_validation = sagemaker.inputs.TrainingInput(\n", + " 
s3_data=\"s3://{}/{}/validation/\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6f2985d8", + "metadata": {}, + "source": [ + "We define the XGBoost model" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "92c1fe8c", + "metadata": {}, + "outputs": [], + "source": [ + "xgb = sagemaker.estimator.Estimator(\n", + " container,\n", + " role,\n", + " instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + " output_path=\"s3://{}/{}/output\".format(s3_bucket, bucket_prefix),\n", + " sagemaker_session=sagemaker_session,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ecafdfe8", + "metadata": {}, + "source": [ + "Set the parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "582adc6c", + "metadata": {}, + "outputs": [], + "source": [ + "xgb.set_hyperparameters(\n", + " max_depth=5,\n", + " eta=0.2,\n", + " gamma=4,\n", + " min_child_weight=6,\n", + " subsample=0.8,\n", + " objective=\"multi:softprob\",\n", + " num_class=19,\n", + " verbosity=0,\n", + " num_round=100,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b36463dd", + "metadata": {}, + "source": [ + "And train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "c24e06fc", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-16-32-32-584\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-02-11 16:32:32 Starting - Starting the training job...\n", + "2024-02-11 16:32:46 Starting - Preparing the instances for training......\n", + "2024-02-11 16:33:57 Downloading - Downloading input data......\n", + "2024-02-11 16:34:37 Downloading - Downloading the training image...\n", + "2024-02-11 16:35:23 Training - Training image download completed. 
Training in progress...\u001b[34m[2024-02-11 16:35:36.980 ip-10-0-69-198.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Train matrix has 17499 rows and 12 columns\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Validation matrix has 5000 rows\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.099 ip-10-0-69-198.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.100 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.101 ip-10-0-69-198.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-11:16:35:37:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.00166#011validation-merror:0.00380\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.268 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-11 16:35:37.271 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.00126#011validation-merror:0.00320\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", + "\u001b[34m[3]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.00103#011validation-merror:0.00260\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", + 
"\u001b[34m[12]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.00046#011validation-merror:0.00180\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.00029#011validation-merror:0.00140\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.00040#011validation-merror:0.00160\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.00017#011validation-merror:0.00100\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[26]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + 
"\u001b[34m[35]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[49]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + 
"\u001b[34m[58]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + 
"\u001b[34m[81]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", + "\n", + "2024-02-11 16:35:53 Uploading - Uploading generated training model\n", + "2024-02-11 16:36:04 Completed - Training job completed\n", + "Training seconds: 127\n", + "Billable seconds: 127\n" + ] + } + ], + "source": [ + "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" + ] + }, + { + "cell_type": "markdown", + "id": "8b716cd7", + "metadata": {}, + "source": [ + "### 
5. Using the endpoint \n", + "\n", + "Deploy the model to an endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xgb.model_data" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "78444d49-4ad3-49e4-a579-19b173facb26", + "metadata": {}, + "outputs": [], + "source": [ + "serving_function = project.get_function(\"serving\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "911457fa-812d-4991-a31c-4dfcb1593d3e", + "metadata": {}, + "outputs": [], + "source": [ + "serving_function_v2 = project.set_function(\n", + " func=\"src/functions/serving.py\",\n", + " name=\"serving-v2\",\n", + " kind=\"serving\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2881c17d-dd84-43d7-acc7-83e40c8110d3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_start->\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "xgboost-model\n", + "\n", + "xgboost-model\n", + "\n", + "\n", + "\n", + "->xgboost-model\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph = serving_function_v2.set_topology(\n", + " \"router\",\n", + " 
mlrun.serving.routers.EnrichmentModelRouter(\n", + " feature_vector_uri=transactions_fv.uri,\n", + " impute_policy={\"*\": \"$mean\"}),\n", + ")\n", + "serving_function_v2.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=xgb.model_data)\n", + "\n", + "# Plot the ensemble configuration\n", + "serving_function_v2.spec.graph.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "0ab0bcd2-5c70-4f48-bff9-d060f027e8e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 16:36:45,242 [info] model xgboost-model was loaded\n", + "> 2024-02-11 16:36:45,243 [info] Loaded ['xgboost-model']\n" + ] + } + ], + "source": [ + "server = serving_function_v2.to_mock_server()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "dd57cfcd-5878-4775-83ee-422dc2261ce8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'inputs': [[211.58623655913973, 4262047194499006, 4017367486513464, 204.26, 99995, 2024, 2, 11, 16, 23, 15, 7.326236559139744]]}\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': '9fca777838b34ecaa9a0978beb4c3324',\n", + " 'model_name': 'xgboost-model',\n", + " 'outputs': [[0.0006098453304730356,\n", + " 0.000491024402435869,\n", + " 0.0005141795263625681,\n", + " 0.0007783450419083238,\n", + " 0.0007057395414449275,\n", + " 0.0006167895044200122,\n", + " 0.0008293685968965292,\n", + " 0.0007642377750016749,\n", + " 0.0004749966901727021,\n", + " 0.0009146890370175242,\n", + " 0.0023798206821084023,\n", + " 0.0007584734121337533,\n", + " 0.0005588593194261193,\n", + " 0.0018726944690570235,\n", + " 0.001147600938566029,\n", + " 0.0010383735643699765,\n", + " 0.0010812224354594946,\n", + " 0.006694969721138477,\n", + " 0.9777687788009644]]}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response = server.test(body={'inputs':[['Pension 
and insurances']]})\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 16:39:18,386 [info] Starting remote function deploy\n", + "2024-02-11 16:39:18 (info) Deploying function\n", + "2024-02-11 16:39:18 (info) Building\n", + "2024-02-11 16:39:19 (info) Staging files and preparing base images\n", + "2024-02-11 16:39:19 (info) Building processor image\n", + "2024-02-11 16:40:24 (info) Build complete\n", + "2024-02-11 16:40:33 (info) Function deploy complete\n", + "> 2024-02-11 16:40:40,471 [info] Successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-v3-admin-serving-v2.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" + ] + }, + { + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-v3-admin-serving-v2'})" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.deploy_function(\"serving-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "ac19dc03-01e2-4e29-ba75-a34804833d5c", + "metadata": {}, + "outputs": [], + "source": [ + "serving_function_v2 = project.get_function(\"serving-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 16:40:40,546 [info] Invoking function: {'method': 'POST', 'path': 
'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//v2/models/xgboost-model/predict'}\n" + ] + } + ], + "source": [ + "response = serving_function_v2.invoke(path='/v2/models/xgboost-model/predict', body={\"inputs\": [['Pension and insurances']]})" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "57eeaddc-654a-41d2-bb51-4a9a787a3311", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '641c5971-c881-4d56-a326-8b02900be8db',\n", + " 'model_name': 'xgboost-model',\n", + " 'outputs': [[0.0006098453304730356,\n", + " 0.000491024402435869,\n", + " 0.0005141795263625681,\n", + " 0.0007783450419083238,\n", + " 0.0007057395414449275,\n", + " 0.0006167895044200122,\n", + " 0.0008293685968965292,\n", + " 0.0007642377750016749,\n", + " 0.0004749966901727021,\n", + " 0.0009146890370175242,\n", + " 0.0023798206821084023,\n", + " 0.0007584734121337533,\n", + " 0.0005588593194261193,\n", + " 0.0018726944690570235,\n", + " 0.001147600938566029,\n", + " 0.0010383735643699765,\n", + " 0.0010812224354594946,\n", + " 0.006694969721138477,\n", + " 0.9777687788009644]]}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response" + ] + }, + { + "cell_type": "markdown", + "id": "712f4d35", + "metadata": {}, + "source": [ + "### 6. 
Evaluate performance \n", + "\n", + "Run the model on our test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35ff008b-f4e4-491b-b1e8-3b0a652c35fc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "2e863ea7-5804-4637-b677-390c305cabfe", + "metadata": {}, + "outputs": [], + "source": [ + "s3_data = \"s3://{}/{}/test/test.csv\".format(s3_bucket, bucket_prefix)" + ] + }, + { + "cell_type": "markdown", + "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", + "metadata": {}, + "source": [ + "Add the evaluation function to our project" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "ca4f7e49", + "metadata": {}, + "outputs": [], + "source": [ + "evaluate_function = project.get_function(\"evaluate\")" + ] + }, + { + "cell_type": "markdown", + "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", + "metadata": {}, + "source": [ + "Run the evaluation job" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 16:40:40,769 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '77fb208c0816491e9f7ae69634b31b1b', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", + "> 2024-02-11 16:40:41,204 [info] Job is running in the background, pod: evaluate-evaluate-x5mzk\n", + "[16:40:45] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", + "configuration generated by an older version of XGBoost, please export the model by calling\n", + "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", + "\n", + " https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html\n", + "\n", + "for more details about differences between saving model and serializing.\n", + "\n", + "> 2024-02-11 16:40:46,110 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin', 'logs_cmd': 'mlrun logs 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin'}\n", + "> 2024-02-11 16:40:46,110 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-v3-admin/jobs/monitor/77fb208c0816491e9f7ae69634b31b1b/overview'}\n", + "> 2024-02-11 16:40:46,110 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sagemaker-v3-admin0Feb 11 16:40:44completedevaluate-evaluate
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc26
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-x5mzk
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category_mapped
factorize_key={'Uncategorized': 0, 'Entertainment': 1, 'Education': 2, 'Shopping': 3, 'Personal Care': 4, 'Health and Fitness': 5, 'Food and Dining': 6, 'Gifts and Donations': 7, 'Investments': 8, 'Bills and Utilities': 9, 'Auto and Transport': 10, 'Travel': 11, 'Fees and Charges': 12, 'Business Services': 13, 'Personal Services': 14, 'Taxes': 15, 'Gambling': 16, 'Home': 17, 'Pension and insurances': 18}
classification_report
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-11 16:40:54,323 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + ] + } + ], + "source": [ + "evaluate_run = evaluate_function.run(\n", + " handler=\"evaluate\",\n", + " params={\n", + " \"model_path\": xgb.model_data,\n", + " \"model_name\": \"xgboost-model\",\n", + " \"test_set\": s3_data,\n", + " \"label_column\": \"transaction_category_mapped\",\n", + " \"factorize_key\": factorize_key,\n", + " },\n", + " returns=[\"classification_report: dataset\"])" + ] + }, + { + "cell_type": "markdown", + "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", + "metadata": {}, + "source": [ + "See the evaluation result" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precisionrecallf1-scoresupport
Uncategorized1.0000001.0000001.0000008.0000
Entertainment1.0000001.0000001.000000362.0000
Education1.0000000.9285710.96296314.0000
Shopping0.9988391.0000000.999419860.0000
Personal Care1.0000001.0000001.00000023.0000
Health and Fitness1.0000001.0000001.000000123.0000
Food and Dining1.0000001.0000001.000000217.0000
Gifts and Donations1.0000001.0000001.00000068.0000
Investments0.9166671.0000000.95652222.0000
Bills and Utilities1.0000001.0000001.00000092.0000
Auto and Transport1.0000001.0000001.000000475.0000
Travel1.0000001.0000001.00000036.0000
Fees and Charges1.0000001.0000001.00000029.0000
Business Services1.0000001.0000001.00000047.0000
Personal Services1.0000001.0000001.00000024.0000
Taxes1.0000000.8461540.91666713.0000
Gambling1.0000001.0000001.0000008.0000
Home1.0000001.0000001.00000052.0000
Pension and insurances1.0000001.0000001.00000027.0000
accuracy0.9988000.9988000.9988000.9988
macro avg0.9955530.9881430.9913462500.0000
weighted avg0.9988670.9988000.9987772500.0000
\n", + "
" + ], + "text/plain": [ + " precision recall f1-score support\n", + "Uncategorized 1.000000 1.000000 1.000000 8.0000\n", + "Entertainment 1.000000 1.000000 1.000000 362.0000\n", + "Education 1.000000 0.928571 0.962963 14.0000\n", + "Shopping 0.998839 1.000000 0.999419 860.0000\n", + "Personal Care 1.000000 1.000000 1.000000 23.0000\n", + "Health and Fitness 1.000000 1.000000 1.000000 123.0000\n", + "Food and Dining 1.000000 1.000000 1.000000 217.0000\n", + "Gifts and Donations 1.000000 1.000000 1.000000 68.0000\n", + "Investments 0.916667 1.000000 0.956522 22.0000\n", + "Bills and Utilities 1.000000 1.000000 1.000000 92.0000\n", + "Auto and Transport 1.000000 1.000000 1.000000 475.0000\n", + "Travel 1.000000 1.000000 1.000000 36.0000\n", + "Fees and Charges 1.000000 1.000000 1.000000 29.0000\n", + "Business Services 1.000000 1.000000 1.000000 47.0000\n", + "Personal Services 1.000000 1.000000 1.000000 24.0000\n", + "Taxes 1.000000 0.846154 0.916667 13.0000\n", + "Gambling 1.000000 1.000000 1.000000 8.0000\n", + "Home 1.000000 1.000000 1.000000 52.0000\n", + "Pension and insurances 1.000000 1.000000 1.000000 27.0000\n", + "accuracy 0.998800 0.998800 0.998800 0.9988\n", + "macro avg 0.995553 0.988143 0.991346 2500.0000\n", + "weighted avg 0.998867 0.998800 0.998777 2500.0000" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate_run.artifact(\"classification_report\").as_df()" + ] + }, + { + "cell_type": "markdown", + "id": "98d0b67e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You should see results similar to this:\n", + "\n", + "```\n", + " precision recall f1-score support\n", + "\n", + " Uncategorized 1.00 0.92 0.96 51\n", + " Entertainment 0.81 0.89 0.85 1486\n", + " Education 1.00 0.94 0.97 80\n", + " Shopping 0.86 0.94 0.90 3441\n", + " Personal Care 1.00 0.98 0.99 132\n", + " Health and Fitness 0.99 0.89 0.94 443\n", + " 
Food and Dining 0.99 0.82 0.90 918\n", + " Gifts and Donations 1.00 0.95 0.97 275\n", + " Investments 0.99 0.97 0.98 88\n", + " Bills and Utilities 1.00 0.99 1.00 332\n", + " Auto and Transport 0.94 0.84 0.88 1967\n", + " Travel 0.96 0.84 0.90 120\n", + " Fees and Charges 1.00 0.94 0.97 106\n", + " Business Services 1.00 0.99 1.00 146\n", + " Personal Services 1.00 0.96 0.98 75\n", + " Taxes 0.98 0.94 0.96 47\n", + " Gambling 1.00 1.00 1.00 15\n", + " Home 0.98 0.89 0.93 168\n", + "Pension and insurances 0.99 1.00 1.00 110\n", + "\n", + " accuracy 0.90 10000\n", + " macro avg 0.97 0.93 0.95 10000\n", + " weighted avg 0.91 0.90 0.90 10000\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "49fdc82d", + "metadata": {}, + "source": [ + "### 7. Clean up \n", + "\n", + "Remove the feature group and endpoint to clean up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f79b1164", + "metadata": {}, + "outputs": [], + "source": [ + "#feature_group.delete()\n", + "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e04b6fa6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This us-east-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-3 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-northeast-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n" + ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "smdemo", + "language": "python", + "name": "smdemo" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 95b83d61bb821594acfb76b6ee9dc4101d9dcb92 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 12 Feb 2024 13:38:48 +0000 Subject: [PATCH 12/16] upadting notebooks names --- financial_payment_classification.ipynb | 2012 ++++++---- financial_payment_classification_v3.ipynb | 3564 ----------------- ..._payment_classification_with_serving.ipynb | 2137 ++++++++++ src/functions/evaluate.py | 4 +- 4 files changed, 3431 insertions(+), 4286 deletions(-) delete mode 100644 financial_payment_classification_v3.ipynb create mode 100644 financial_payment_classification_with_serving.ipynb diff --git a/financial_payment_classification.ipynb b/financial_payment_classification.ipynb index b0d8037..b034e1c 100644 --- a/financial_payment_classification.ipynb +++ b/financial_payment_classification.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "01b5c703", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": 
"6498f087", "metadata": {}, @@ -23,6 +25,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c2e49281", "metadata": {}, @@ -67,63 +70,29 @@ "cell_type": "code", "execution_count": 1, "id": "fff19d6b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import sys\n", - "!{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", - "!{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", - "!{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", "metadata": {}, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:10:06,832 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" + "\u001b[33mWARNING: Ignoring invalid distribution -yyaml (/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -yyaml (/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -yyaml (/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -yyaml (/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages)\u001b[0m\u001b[33m\n", + "\u001b[0m" ] } ], "source": [ - "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker\", \n", - " user_project=True,\n", - " parameters={\n", - " # 
\"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", - " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", - " }\n", - ")" + "import sys\n", + "\n", + "!{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", + "!{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1b17a94d", "metadata": {}, @@ -133,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "42c5d6d0", "metadata": {}, "outputs": [ @@ -142,7 +111,7 @@ "output_type": "stream", "text": [ "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", - "sagemaker.config INFO - Not applying SDK defaults from location: /User/.config/sagemaker/config.yaml\n" + "sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml\n" ] } ], @@ -152,36 +121,19 @@ "import sagemaker\n", "import time\n", "import os\n", + "\n", "from time import sleep\n", "from sklearn.metrics import classification_report\n", "from sagemaker.feature_store.feature_group import FeatureGroup\n", + "\n", "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", - "metadata": {}, - "outputs": [], - "source": [ - "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", - "metadata": {}, - "outputs": [], - "source": [ - "sess = sagemaker.Session()\n", - "write_bucket = sess.default_bucket()\n", - "write_prefix = \"sagemaker-app-lab\"" + "import numpy as np\n", + "\n", + "import mlrun" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3af7c33d", "metadata": {}, @@ -191,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, 
"id": "c0e4db17", "metadata": {}, "outputs": [], @@ -200,13 +152,13 @@ "sm_client = boto3.client(\"sagemaker\")\n", "boto_session = boto3.Session(region_name=region)\n", "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", - "#role = sagemaker.get_execution_role()\n", - "role = sagemaker_role\n", + "role = os.environ[\"SAGEMAKER-ROLE\"]\n", "bucket_prefix = \"payment-classification\"\n", "s3_bucket = sagemaker_session.default_bucket()" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4fe6a975", "metadata": {}, @@ -216,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "43946b9f", "metadata": {}, "outputs": [], @@ -241,10 +193,13 @@ " \"Gambling\": 16,\n", " \"Home\": 17,\n", " \"Pension and insurances\": 18,\n", - "}" + "}\n", + "\n", + "factorize_key = {key: str(value) for key, value in factorize_key.items()}" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5e3dc3c4", "metadata": {}, @@ -256,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "5ff0d280", "metadata": {}, "outputs": [], @@ -270,6 +225,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "08578d93", "metadata": {}, @@ -279,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "a477abd7", "metadata": {}, "outputs": [], @@ -293,6 +249,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cf6be447", "metadata": {}, @@ -314,6 +271,161 @@ ] }, { + "cell_type": "code", + "execution_count": 7, + "id": "558fa01c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_categoryreceiver_idsender_idamounttimestamp
47177Shopping4968649149775898460907230482834273.832021-04-01 22:43:20
41438Shopping40628421481442334878939591031620119.592021-04-04 18:44:32
44521Shopping44626242205168194602386441721138123.812021-04-04 16:38:48
8630Entertainment4770991682847140406170368314742275.002021-02-14 19:53:54
48998Shopping4419770343842824452419728778121879.452021-01-12 23:27:43
30235Shopping4928680229033121453813232654946043.072021-04-12 11:03:14
50088Personal Care4827290622904985475106951279062537.772021-01-14 18:20:44
36649Shopping47216869630108364939040219765629131.132021-04-24 16:29:56
32358Shopping48092052138072744544332332005588110.792021-04-26 07:20:06
58131Food and Dining42365609875785574629183456621801216.962021-01-15 17:44:36
\n", + "
" + ], + "text/plain": [ + " transaction_category receiver_id sender_id amount \\\n", + "47177 Shopping 4968649149775898 4609072304828342 73.83 \n", + "41438 Shopping 4062842148144233 4878939591031620 119.59 \n", + "44521 Shopping 4462624220516819 4602386441721138 123.81 \n", + "8630 Entertainment 4770991682847140 4061703683147422 75.00 \n", + "48998 Shopping 4419770343842824 4524197287781218 79.45 \n", + "30235 Shopping 4928680229033121 4538132326549460 43.07 \n", + "50088 Personal Care 4827290622904985 4751069512790625 37.77 \n", + "36649 Shopping 4721686963010836 4939040219765629 131.13 \n", + "32358 Shopping 4809205213807274 4544332332005588 110.79 \n", + "58131 Food and Dining 4236560987578557 4629183456621801 216.96 \n", + "\n", + " timestamp \n", + "47177 2021-04-01 22:43:20 \n", + "41438 2021-04-04 18:44:32 \n", + "44521 2021-04-04 16:38:48 \n", + "8630 2021-02-14 19:53:54 \n", + "48998 2021-01-12 23:27:43 \n", + "30235 2021-04-12 11:03:14 \n", + "50088 2021-01-14 18:20:44 \n", + "36649 2021-04-24 16:29:56 \n", + "32358 2021-04-26 07:20:06 \n", + "58131 2021-01-15 17:44:36 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.sample(10)" + ] + }, + { + "attachments": {}, "cell_type": "markdown", "id": "b5492919", "metadata": {}, @@ -323,7 +435,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "id": "24f6090e", "metadata": {}, "outputs": [], @@ -338,35 +450,132 @@ "del data[\"timestamp\"]" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f7314f8a", + "metadata": {}, + "source": [ + "We'll transform the transaction categories to numeric targets for the classification by factorization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ea2ebdd5", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9b9ed583", + "metadata": {}, + "source": [ + "### 3. Create feature store \n", + "\n", + "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "233b862a", + "metadata": {}, + "source": [ + "Before creating the feature store itself we need to set a name for the feature group and identifier used" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9df53a9c", + "metadata": {}, + "outputs": [], + "source": [ + "feature_group_name = \"feature-group-payment-classification\"\n", + "record_identifier_feature_name = \"identifier\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8d9b663f", + "metadata": {}, + "source": [ + "With the name we defined we create the feature group, runtime and session" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1aef7f91", + "metadata": {}, + "outputs": [], + "source": [ + "feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)\n", + "\n", + "featurestore_runtime = boto_session.client(\n", + " service_name=\"sagemaker-featurestore-runtime\", region_name=region\n", + ")\n", + "\n", + "feature_store_session = sagemaker.Session(\n", + " boto_session=boto_session,\n", + " sagemaker_client=sm_client,\n", + " sagemaker_featurestore_runtime_client=featurestore_runtime,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3f3d69f5", + "metadata": {}, + "source": [ + "Once we have defined our feature store we need to put some data in it. 
We create a Pandas dataframe with the columns mean_amount, count, identifier and event time to store in the feature store" + ] + }, { "cell_type": "code", "execution_count": 12, - "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", + "id": "a1a250da", "metadata": {}, "outputs": [], "source": [ - "for key, val in factorize_key.items():\n", - " factorize_key[key] = str(val)" + "columns = [\"mean_amount\", \"count\", \"identifier\", \"EventTime\"]\n", + "feature_store_data = pd.DataFrame(columns=columns, dtype=object)\n", + "\n", + "feature_store_data[\"identifier\"] = range(19)\n", + "feature_store_data[\"mean_amount\"] = 0.0\n", + "feature_store_data[\"count\"] = 1\n", + "feature_store_data[\"EventTime\"] = time.time()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "dea2565b", + "metadata": {}, + "source": [ + "Using the created dataframe we set the feature definitions" ] }, { "cell_type": "code", "execution_count": 13, - "id": "0ee06b1d-0cfb-4242-a7e7-2443a0377d99", + "id": "292571c7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "\n", - "[ 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances']\n", - "Length: 19, dtype: string" + "[FeatureDefinition(feature_name='mean_amount', feature_type=, collection_type=None),\n", + " FeatureDefinition(feature_name='count', feature_type=, collection_type=None),\n", + " FeatureDefinition(feature_name='identifier', feature_type=, collection_type=None),\n", + " FeatureDefinition(feature_name='EventTime', feature_type=, collection_type=None)]" ] }, "execution_count": 13, @@ -375,145 +584,106 @@ } ], "source": [ - "data[\"transaction_category\"].unique()" + 
"feature_group.load_feature_definitions(data_frame=feature_store_data)" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "f7314f8a", + "id": "2a845e07", "metadata": {}, "source": [ - "We'll transform the transaction categories to numeric targets for the classification by factorization." + "With these definitions ready we can create the feature group itself" ] }, { "cell_type": "code", "execution_count": 14, - "id": "ea2ebdd5", + "id": "d046eeb4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-2:934638699319:feature-group/feature-group-payment-classification',\n", + " 'ResponseMetadata': {'RequestId': '32eefc0a-1d32-4ce8-9801-715d94cff910',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amzn-requestid': '32eefc0a-1d32-4ce8-9801-715d94cff910',\n", + " 'content-type': 'application/x-amz-json-1.1',\n", + " 'content-length': '113',\n", + " 'date': 'Mon, 12 Feb 2024 13:21:13 GMT'},\n", + " 'RetryAttempts': 0}}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" + "feature_group.create(\n", + " s3_uri=f\"s3://{s3_bucket}/{bucket_prefix}\",\n", + " record_identifier_name=record_identifier_feature_name,\n", + " event_time_feature_name=\"EventTime\",\n", + " role_arn=role,\n", + " enable_online_store=True,\n", + ")" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", - "metadata": { - "tags": [] - }, + "id": "95ad73b1", + "metadata": {}, "source": [ - "### 3. Create feature store \n", - "\n", - "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 
" + "It takes a couple of minutes for the feature group to be created, we need to wait for this to be done before trying to ingest data in the feature store" ] }, { - "cell_type": "markdown", - "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", + "cell_type": "code", + "execution_count": 15, + "id": "530865ec", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for Feature Group to be Created\n", + "Waiting for Feature Group to be Created\n", + "Waiting for Feature Group to be Created\n", + "FeatureGroup feature-group-payment-classification successfully created.\n" + ] + } + ], "source": [ - "#### feature-group-payment-classification" + "status = feature_group.describe().get(\"FeatureGroupStatus\")\n", + "while status == \"Creating\":\n", + " print(\"Waiting for Feature Group to be Created\")\n", + " time.sleep(5)\n", + " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", + "print(f\"FeatureGroup {feature_group.name} successfully created.\")" ] }, { - "cell_type": "code", - "execution_count": 15, - "id": "3c621044-681a-4e1a-9968-f637ed992539", + "attachments": {}, + "cell_type": "markdown", + "id": "3df88321", "metadata": {}, - "outputs": [], "source": [ - "def add_grouped_features(df):\n", - " feature_store_data = pd.DataFrame()\n", - " feature_store_data[\"mean_amount\"] = df.groupby([\"transaction_category\"]).mean()[\"amount\"]\n", - " feature_store_data[\"count\"] = df.groupby([\"transaction_category\"]).count()[\"amount\"]\n", - " feature_store_data[\"identifier\"] = feature_store_data.index\n", - " feature_store_data[\"EventTime\"] = time.time()\n", - " \n", - " \n", - " \n", - " additional_features = pd.pivot_table(\n", - " feature_store_data, values=[\"mean_amount\"], index=[\"identifier\"]\n", - " ).T.add_prefix(\"dist_\")\n", - " additional_features_columns = list(additional_features.columns)\n", - " df2 = df.copy()\n", - " df2 = pd.concat([df2, 
pd.DataFrame(columns=additional_features_columns, dtype=object)])\n", - " df2[additional_features_columns] = additional_features.values[0]\n", - " for col in additional_features_columns:\n", - " df2[col] = abs(df2[col] - df2[\"amount\"]) \n", - " df2['transaction_id']= df2.reset_index().index \n", - " return df2" + "Once the feature group is created we can ingest data into it" ] }, { "cell_type": "code", "execution_count": 16, - "id": "c71af4a9-f2d8-40ca-b0bf-3ef67c5b69d9", + "id": "8168ebd5", "metadata": {}, "outputs": [ { "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "add_grouped_features\n", - "\n", - "add_grouped_features\n", - "\n", - "\n", - "\n", - "_start->add_grouped_features\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet/parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "add_grouped_features->parquet/parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql/nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "add_grouped_features->nosql/nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], "text/plain": [ - "" + "IngestionManagerPandas(feature_group_name='feature-group-payment-classification', sagemaker_fs_runtime_client_config=, sagemaker_session=, max_workers=3, max_processes=1, profile_name=None, _async_result=, _processing_pool=, _failed_indices=[])" ] }, "execution_count": 16, @@ -522,41 +692,329 @@ } ], "source": [ - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import ParquetTarget\n", + "feature_group.ingest(data_frame=feature_store_data, max_workers=3, wait=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "485d1906", + "metadata": {}, + "source": [ + "To retrieve data from our feature store we define a function that gets the current values from the feature store" + ] + }, + { + "cell_type": "code", + 
"execution_count": 17, + "id": "5f36a576", + "metadata": {}, + "outputs": [], + "source": [ + "def get_feature_store_values():\n", + " response = featurestore_runtime.batch_get_record(\n", + " Identifiers=[\n", + " {\n", + " \"FeatureGroupName\": feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [str(i) for i in range(19)],\n", + " }\n", + " ]\n", + " )\n", "\n", - "# creating feature set\n", - "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", - " entities=[fstore.Entity(\"transaction_id\")],\n", - " engine=\"pandas\",\n", - " description=\"transactions feature set\")\n", + " columns = [\"mean_amount\", \"count\", \"identifier\", \"EventTime\"]\n", "\n", - "# setting up the graph\n", - "extended_transactions_set.graph \\\n", - " .to(name='add_grouped_features', handler='add_grouped_features')\n", - " # Add aggregations for 2, 12, and 24 hour time windows\n", - " \n", - " \n", + " feature_store_resp = pd.DataFrame(\n", + " data=[\n", + " [resp[\"Record\"][i][\"ValueAsString\"] for i in range(len(columns))]\n", + " for resp in response[\"Records\"]\n", + " ],\n", + " columns=columns,\n", + " )\n", + " feature_store_resp[\"identifier\"] = feature_store_resp[\"identifier\"].astype(int)\n", + " feature_store_resp[\"count\"] = feature_store_resp[\"count\"].astype(int)\n", + " feature_store_resp[\"mean_amount\"] = feature_store_resp[\"mean_amount\"].astype(float)\n", + " feature_store_resp[\"EventTime\"] = feature_store_resp[\"EventTime\"].astype(float)\n", + " feature_store_resp = feature_store_resp.sort_values(by=\"identifier\")\n", "\n", + " return feature_store_resp\n", "\n", - "extended_transactions_set.set_targets()\n", "\n", - "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" + "feature_store_resp = get_feature_store_values()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b5e4834e", + "metadata": {}, + "source": [ + "We update the values in the feature store with the real values of our data" 
] }, { "cell_type": "code", - "execution_count": 17, - "id": "2085e0a9-56e1-4641-a4a6-64e2124d9c15", + "execution_count": 18, + "id": "cb025e68", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-01-25 14:11:30,483 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" - ] - }, + "data": { + "text/plain": [ + "IngestionManagerPandas(feature_group_name='feature-group-payment-classification', sagemaker_fs_runtime_client_config=, sagemaker_session=, max_workers=3, max_processes=1, profile_name=None, _async_result=, _processing_pool=, _failed_indices=[])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_store_data = pd.DataFrame()\n", + "feature_store_data[\"mean_amount\"] = data.groupby([\"transaction_category\"]).mean()[\"amount\"]\n", + "feature_store_data[\"count\"] = data.groupby([\"transaction_category\"]).count()[\"amount\"]\n", + "feature_store_data[\"identifier\"] = feature_store_data.index\n", + "feature_store_data[\"EventTime\"] = time.time()\n", + "\n", + "feature_store_data[\"mean_amount\"] = (\n", + " pd.concat([feature_store_resp, feature_store_data])\n", + " .groupby(\"identifier\")\n", + " .apply(lambda x: np.average(x[\"mean_amount\"], weights=x[\"count\"]))\n", + ")\n", + "feature_store_data[\"count\"] = (\n", + " pd.concat([feature_store_resp, feature_store_data]).groupby(\"identifier\").sum()[\"count\"]\n", + ")\n", + "\n", + "feature_group.ingest(data_frame=feature_store_data, max_workers=3, wait=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e2f6395f", + "metadata": {}, + "source": [ + "And display them after getting them from the feature store" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "10b23bf6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_amountcountidentifierEventTime
2495.83735546501.707744e+09
1551.2093821451211.707744e+09
11851.15349574421.707744e+09
14100.9175033395331.707744e+09
531.504036120941.707744e+09
8119.596405483751.707744e+09
493.249393967461.707744e+09
1651.072447279171.707744e+09
176024.54748493081.707744e+09
10114.779558334991.707744e+09
7100.97711019349101.707744e+09
9351.8075881302111.707744e+09
026.950559930121.707744e+09
3204.7391971395131.707744e+09
18496.766086930141.707744e+09
122913.673247465151.707744e+09
13375.228871186161.707744e+09
6781.3921991860171.707744e+09
1205.4571511116181.707744e+09
\n", + "
" + ], + "text/plain": [ + " mean_amount count identifier EventTime\n", + "2 495.837355 465 0 1.707744e+09\n", + "15 51.209382 14512 1 1.707744e+09\n", + "11 851.153495 744 2 1.707744e+09\n", + "14 100.917503 33953 3 1.707744e+09\n", + "5 31.504036 1209 4 1.707744e+09\n", + "8 119.596405 4837 5 1.707744e+09\n", + "4 93.249393 9674 6 1.707744e+09\n", + "16 51.072447 2791 7 1.707744e+09\n", + "17 6024.547484 930 8 1.707744e+09\n", + "10 114.779558 3349 9 1.707744e+09\n", + "7 100.977110 19349 10 1.707744e+09\n", + "9 351.807588 1302 11 1.707744e+09\n", + "0 26.950559 930 12 1.707744e+09\n", + "3 204.739197 1395 13 1.707744e+09\n", + "18 496.766086 930 14 1.707744e+09\n", + "12 2913.673247 465 15 1.707744e+09\n", + "13 375.228871 186 16 1.707744e+09\n", + "6 781.392199 1860 17 1.707744e+09\n", + "1 205.457151 1116 18 1.707744e+09" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_store_data = get_feature_store_values()\n", + "feature_store_data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cf148985", + "metadata": {}, + "source": [ + "We use the feature store to calculate the distance between the average of every category and the current amount" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4a3e85de", + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -589,16 +1047,16 @@ " minute\n", " second\n", " ...\n", - " dist_18\n", - " dist_2\n", - " dist_3\n", - " dist_4\n", - " dist_5\n", - " dist_6\n", - " dist_7\n", - " dist_8\n", - " dist_9\n", - " transaction_id\n", + " 9_dist\n", + " 10_dist\n", + " 11_dist\n", + " 12_dist\n", + " 13_dist\n", + " 14_dist\n", + " 15_dist\n", + " 16_dist\n", + " 17_dist\n", + " 18_dist\n", " \n", " \n", " \n", @@ -615,16 +1073,16 @@ " 57.0\n", " 42.0\n", " ...\n", - " 627.802849\n", - " 17.893495\n", - " 732.342497\n", - " 801.755964\n", - " 713.663595\n", - " 740.010607\n", - " 782.187553\n", - " 
5191.287484\n", " 718.480442\n", - " 0\n", + " 732.28289\n", + " 481.452412\n", + " 806.309441\n", + " 628.520803\n", + " 336.493914\n", + " 2080.413247\n", + " 458.031129\n", + " 51.867801\n", + " 627.802849\n", " \n", " \n", " 1\n", @@ -639,16 +1097,16 @@ " 53.0\n", " 32.0\n", " ...\n", - " 391.172849\n", - " 254.523495\n", - " 495.712497\n", - " 565.125964\n", - " 477.033595\n", - " 503.380607\n", - " 545.557553\n", - " 5427.917484\n", " 481.850442\n", - " 1\n", + " 495.65289\n", + " 244.822412\n", + " 569.679441\n", + " 391.890803\n", + " 99.863914\n", + " 2317.043247\n", + " 221.401129\n", + " 184.762199\n", + " 391.172849\n", " \n", " \n", " 2\n", @@ -663,16 +1121,16 @@ " 29.0\n", " 32.0\n", " ...\n", - " 28.697151\n", - " 674.393495\n", - " 75.842497\n", - " 145.255964\n", - " 57.163595\n", - " 83.510607\n", - " 125.687553\n", - " 5847.787484\n", " 61.980442\n", - " 2\n", + " 75.78289\n", + " 175.047588\n", + " 149.809441\n", + " 27.979197\n", + " 320.006086\n", + " 2736.913247\n", + " 198.468871\n", + " 604.632199\n", + " 28.697151\n", " \n", " \n", " 3\n", @@ -687,16 +1145,16 @@ " 14.0\n", " 19.0\n", " ...\n", - " 674.322849\n", - " 28.626505\n", - " 778.862497\n", - " 848.275964\n", - " 760.183595\n", - " 786.530607\n", - " 828.707553\n", - " 5144.767484\n", " 765.000442\n", - " 3\n", + " 778.80289\n", + " 527.972412\n", + " 852.829441\n", + " 675.040803\n", + " 383.013914\n", + " 2033.893247\n", + " 504.551129\n", + " 98.387801\n", + " 674.322849\n", " \n", " \n", " 4\n", @@ -711,16 +1169,16 @@ " 50.0\n", " 16.0\n", " ...\n", - " 536.792849\n", - " 108.903495\n", - " 641.332497\n", - " 710.745964\n", - " 622.653595\n", - " 649.000607\n", - " 691.177553\n", - " 5282.297484\n", " 627.470442\n", - " 4\n", + " 641.27289\n", + " 390.442412\n", + " 715.299441\n", + " 537.510803\n", + " 245.483914\n", + " 2171.423247\n", + " 367.021129\n", + " 39.142199\n", + " 536.792849\n", " \n", " \n", " ...\n", @@ -759,16 +1217,16 @@ " 23.0\n", " 53.0\n", " ...\n", - " 
0.027151\n", - " 645.723495\n", - " 104.512497\n", - " 173.925964\n", - " 85.833595\n", - " 112.180607\n", - " 154.357553\n", - " 5819.117484\n", " 90.650442\n", - " 99992\n", + " 104.45289\n", + " 146.377588\n", + " 178.479441\n", + " 0.690803\n", + " 291.336086\n", + " 2708.243247\n", + " 169.798871\n", + " 575.962199\n", + " 0.027151\n", " \n", " \n", " 99993\n", @@ -783,16 +1241,16 @@ " 30.0\n", " 18.0\n", " ...\n", - " 53.967151\n", - " 699.663495\n", - " 50.572497\n", - " 119.985964\n", - " 31.893595\n", - " 58.240607\n", - " 100.417553\n", - " 5873.057484\n", " 36.710442\n", - " 99993\n", + " 50.51289\n", + " 200.317588\n", + " 124.539441\n", + " 53.249197\n", + " 345.276086\n", + " 2762.183247\n", + " 223.738871\n", + " 629.902199\n", + " 53.967151\n", " \n", " \n", " 99994\n", @@ -807,16 +1265,16 @@ " 51.0\n", " 10.0\n", " ...\n", - " 17.177151\n", - " 662.873495\n", - " 87.362497\n", - " 156.775964\n", - " 68.683595\n", - " 95.030607\n", - " 137.207553\n", - " 5836.267484\n", " 73.500442\n", - " 99994\n", + " 87.30289\n", + " 163.527588\n", + " 161.329441\n", + " 16.459197\n", + " 308.486086\n", + " 2725.393247\n", + " 186.948871\n", + " 593.112199\n", + " 17.177151\n", " \n", " \n", " 99995\n", @@ -831,16 +1289,16 @@ " 25.0\n", " 7.0\n", " ...\n", - " 1.197151\n", - " 646.893495\n", - " 103.342497\n", - " 172.755964\n", - " 84.663595\n", - " 111.010607\n", - " 153.187553\n", - " 5820.287484\n", " 89.480442\n", - " 99995\n", + " 103.28289\n", + " 147.547588\n", + " 177.309441\n", + " 0.479197\n", + " 292.506086\n", + " 2709.413247\n", + " 170.968871\n", + " 577.132199\n", + " 1.197151\n", " \n", " \n", " 99996\n", @@ -855,20 +1313,20 @@ " 42.0\n", " 0.0\n", " ...\n", - " 2.462849\n", - " 643.233495\n", - " 107.002497\n", - " 176.415964\n", - " 88.323595\n", - " 114.670607\n", - " 156.847553\n", - " 5816.627484\n", " 93.140442\n", - " 99996\n", + " 106.94289\n", + " 143.887588\n", + " 180.969441\n", + " 3.180803\n", + " 288.846086\n", + " 2705.753247\n", + 
" 167.308871\n", + " 573.472199\n", + " 2.462849\n", " \n", " \n", "\n", - "

99997 rows × 30 columns

\n", + "

99997 rows × 29 columns

\n", "" ], "text/plain": [ @@ -885,84 +1343,68 @@ "99995 18 4.262047e+15 4.017367e+15 204.26 2021.0 2.0 \n", "99996 18 4.627517e+15 4.250421e+15 207.92 2021.0 4.0 \n", "\n", - " day hour minute second ... dist_18 dist_2 dist_3 \\\n", - "0 10.0 19.0 57.0 42.0 ... 627.802849 17.893495 732.342497 \n", - "1 11.0 17.0 53.0 32.0 ... 391.172849 254.523495 495.712497 \n", - "2 21.0 18.0 29.0 32.0 ... 28.697151 674.393495 75.842497 \n", - "3 9.0 16.0 14.0 19.0 ... 674.322849 28.626505 778.862497 \n", - "4 4.0 15.0 50.0 16.0 ... 536.792849 108.903495 641.332497 \n", - "... ... ... ... ... ... ... ... ... \n", - "99992 20.0 12.0 23.0 53.0 ... 0.027151 645.723495 104.512497 \n", - "99993 24.0 19.0 30.0 18.0 ... 53.967151 699.663495 50.572497 \n", - "99994 8.0 19.0 51.0 10.0 ... 17.177151 662.873495 87.362497 \n", - "99995 14.0 23.0 25.0 7.0 ... 1.197151 646.893495 103.342497 \n", - "99996 14.0 0.0 42.0 0.0 ... 2.462849 643.233495 107.002497 \n", + " day hour minute second ... 9_dist 10_dist 11_dist \\\n", + "0 10.0 19.0 57.0 42.0 ... 718.480442 732.28289 481.452412 \n", + "1 11.0 17.0 53.0 32.0 ... 481.850442 495.65289 244.822412 \n", + "2 21.0 18.0 29.0 32.0 ... 61.980442 75.78289 175.047588 \n", + "3 9.0 16.0 14.0 19.0 ... 765.000442 778.80289 527.972412 \n", + "4 4.0 15.0 50.0 16.0 ... 627.470442 641.27289 390.442412 \n", + "... ... ... ... ... ... ... ... ... \n", + "99992 20.0 12.0 23.0 53.0 ... 90.650442 104.45289 146.377588 \n", + "99993 24.0 19.0 30.0 18.0 ... 36.710442 50.51289 200.317588 \n", + "99994 8.0 19.0 51.0 10.0 ... 73.500442 87.30289 163.527588 \n", + "99995 14.0 23.0 25.0 7.0 ... 89.480442 103.28289 147.547588 \n", + "99996 14.0 0.0 42.0 0.0 ... 
93.140442 106.94289 143.887588 \n", "\n", - " dist_4 dist_5 dist_6 dist_7 dist_8 \\\n", - "0 801.755964 713.663595 740.010607 782.187553 5191.287484 \n", - "1 565.125964 477.033595 503.380607 545.557553 5427.917484 \n", - "2 145.255964 57.163595 83.510607 125.687553 5847.787484 \n", - "3 848.275964 760.183595 786.530607 828.707553 5144.767484 \n", - "4 710.745964 622.653595 649.000607 691.177553 5282.297484 \n", - "... ... ... ... ... ... \n", - "99992 173.925964 85.833595 112.180607 154.357553 5819.117484 \n", - "99993 119.985964 31.893595 58.240607 100.417553 5873.057484 \n", - "99994 156.775964 68.683595 95.030607 137.207553 5836.267484 \n", - "99995 172.755964 84.663595 111.010607 153.187553 5820.287484 \n", - "99996 176.415964 88.323595 114.670607 156.847553 5816.627484 \n", + " 12_dist 13_dist 14_dist 15_dist 16_dist \\\n", + "0 806.309441 628.520803 336.493914 2080.413247 458.031129 \n", + "1 569.679441 391.890803 99.863914 2317.043247 221.401129 \n", + "2 149.809441 27.979197 320.006086 2736.913247 198.468871 \n", + "3 852.829441 675.040803 383.013914 2033.893247 504.551129 \n", + "4 715.299441 537.510803 245.483914 2171.423247 367.021129 \n", + "... ... ... ... ... ... \n", + "99992 178.479441 0.690803 291.336086 2708.243247 169.798871 \n", + "99993 124.539441 53.249197 345.276086 2762.183247 223.738871 \n", + "99994 161.329441 16.459197 308.486086 2725.393247 186.948871 \n", + "99995 177.309441 0.479197 292.506086 2709.413247 170.968871 \n", + "99996 180.969441 3.180803 288.846086 2705.753247 167.308871 \n", "\n", - " dist_9 transaction_id \n", - "0 718.480442 0 \n", - "1 481.850442 1 \n", - "2 61.980442 2 \n", - "3 765.000442 3 \n", - "4 627.470442 4 \n", - "... ... ... 
\n", - "99992 90.650442 99992 \n", - "99993 36.710442 99993 \n", - "99994 73.500442 99994 \n", - "99995 89.480442 99995 \n", - "99996 93.140442 99996 \n", + " 17_dist 18_dist \n", + "0 51.867801 627.802849 \n", + "1 184.762199 391.172849 \n", + "2 604.632199 28.697151 \n", + "3 98.387801 674.322849 \n", + "4 39.142199 536.792849 \n", + "... ... ... \n", + "99992 575.962199 0.027151 \n", + "99993 629.902199 53.967151 \n", + "99994 593.112199 17.177151 \n", + "99995 577.132199 1.197151 \n", + "99996 573.472199 2.462849 \n", "\n", - "[99997 rows x 30 columns]" + "[99997 rows x 29 columns]" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import mlrun.feature_store as fstore\n", - "data = extended_transactions_set.ingest(data, overwrite=True)\n", + "additional_features = pd.pivot_table(\n", + " feature_store_data, values=[\"mean_amount\"], index=[\"identifier\"]\n", + ").T.add_suffix(\"_dist\")\n", + "additional_features_columns = list(additional_features.columns)\n", + "data = pd.concat([data, pd.DataFrame(columns=additional_features_columns, dtype=object)])\n", + "data[additional_features_columns] = additional_features.values[0]\n", + "for col in additional_features_columns:\n", + " data[col] = abs(data[col] - data[\"amount\"])\n", + "\n", "data" ] }, { - "cell_type": "markdown", - "id": "b5e4834e", - "metadata": {}, - "source": [ - "We update the values in the feature store with the real values of our data" - ] - }, - { - "cell_type": "markdown", - "id": "e2f6395f", - "metadata": {}, - "source": [ - "And display them after getting them from the feature store" - ] - }, - { - "cell_type": "markdown", - "id": "cf148985", - "metadata": {}, - "source": [ - "We use the feature store to calculate the distance between the average of every category and the current amount" - ] - }, - { + "attachments": {}, "cell_type": "markdown", "id": "289eeca6", "metadata": {}, @@ -977,7 +1419,7 @@ }, { "cell_type": 
"code", - "execution_count": 18, + "execution_count": 21, "id": "bb4bdd8d", "metadata": {}, "outputs": [], @@ -989,6 +1431,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f81f65b9", "metadata": {}, @@ -998,7 +1441,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -1009,6 +1452,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "de669936", "metadata": {}, @@ -1018,7 +1462,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "e1ca2543", "metadata": {}, "outputs": [], @@ -1035,6 +1479,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "22de532f", "metadata": {}, @@ -1044,7 +1489,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "a41b6a7d", "metadata": {}, "outputs": [], @@ -1053,6 +1498,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "66cae2a9", "metadata": {}, @@ -1062,7 +1508,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -1076,6 +1522,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6f2985d8", "metadata": {}, @@ -1085,7 +1532,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -1101,6 +1548,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ecafdfe8", "metadata": {}, @@ -1110,7 +1558,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -1129,6 +1577,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b36463dd", "metadata": {}, @@ -1138,7 +1587,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "id": "c24e06fc", "metadata": { "scrolled": true @@ -1148,143 +1597,143 @@ "name": "stderr", "output_type": "stream", "text": [ - 
"INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-25-14-12-01-149\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-12-13-21-36-187\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-01-25 14:12:01 Starting - Starting the training job...\n", - "2024-01-25 14:12:18 Starting - Preparing the instances for training.........\n", - "2024-01-25 14:13:58 Downloading - Downloading input data......\n", - "2024-01-25 14:14:34 Downloading - Downloading the training image...\n", - "2024-01-25 14:15:29 Training - Training image download completed. Training in progress...\u001B[34m[2024-01-25 14:15:41.041 ip-10-2-106-129.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Imported framework sagemaker_xgboost_container.training\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001B[0m\n", - "\u001B[34mReturning the value itself\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] No GPUs detected (normal if no gpus installed)\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Running XGBoost Sagemaker in algorithm mode\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Single node training.\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Train matrix has 69997 rows and 29 columns\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Validation matrix has 20000 rows\u001B[0m\n", - "\u001B[34m[2024-01-25 14:15:41.342 ip-10-2-106-129.ec2.internal:7 INFO json_config.py:91] Creating hook from 
json_config at /opt/ml/input/config/debughookconfig.json.\u001B[0m\n", - "\u001B[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001B[0m\n", - "\u001B[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001B[0m\n", - "\u001B[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001B[0m\n", - "\u001B[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001B[0m\n", - "\u001B[34m[2024-01-25:14:15:41:INFO] Debug hook created from config\u001B[0m\n", - "\u001B[34m[0]#011train-merror:0.00047#011validation-merror:0.00050\u001B[0m\n", - "\u001B[34m[2024-01-25 14:15:42.380 ip-10-2-106-129.ec2.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001B[0m\n", - "\u001B[34m[2024-01-25 14:15:42.383 ip-10-2-106-129.ec2.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001B[0m\n", - "\u001B[34m[1]#011train-merror:0.00023#011validation-merror:0.00040\u001B[0m\n", - "\u001B[34m[2]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[3]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[4]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[5]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[6]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[7]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[8]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[9]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - 
"\u001B[34m[10]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[11]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[12]#011train-merror:0.00001#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[13]#011train-merror:0.00001#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[14]#011train-merror:0.00001#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[15]#011train-merror:0.00001#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[16]#011train-merror:0.00001#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[17]#011train-merror:0.00001#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[18]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[19]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[20]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[21]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[22]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[23]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[24]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[25]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[26]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[27]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[28]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[29]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[30]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[31]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - "\u001B[34m[32]#011train-merror:0.00000#011validation-merror:0.00015\u001B[0m\n", - 
"\u001B[34m[33]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[34]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[35]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[36]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[37]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[38]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[39]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[40]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[41]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[42]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[43]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[44]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[45]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[46]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[47]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[48]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[49]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[50]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[51]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[52]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[53]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[54]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[55]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - 
"\u001B[34m[56]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[57]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[58]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[59]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[60]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[61]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[62]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[63]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[64]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[65]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[66]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[67]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[68]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[69]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[70]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[71]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[72]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[73]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[74]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[75]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[76]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[77]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[78]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - 
"\u001B[34m[79]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[80]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[81]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[82]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[83]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[84]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[85]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[86]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[87]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[88]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[89]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", + "2024-02-12 13:21:36 Starting - Starting the training job...\n", + "2024-02-12 13:21:52 Starting - Preparing the instances for training......\n", + "2024-02-12 13:22:57 Downloading - Downloading input data......\n", + "2024-02-12 13:23:37 Downloading - Downloading the training image...\n", + "2024-02-12 13:24:23 Training - Training image download completed. 
Training in progress....\u001b[34m[2024-02-12 13:24:37.269 ip-10-0-177-37.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Train matrix has 69997 rows and 28 columns\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Validation matrix has 20000 rows\u001b[0m\n", + "\u001b[34m[2024-02-12 13:24:37.624 ip-10-0-177-37.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-12 13:24:37.624 ip-10-0-177-37.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-12 13:24:37.625 ip-10-0-177-37.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-12 13:24:37.625 ip-10-0-177-37.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-12 13:24:37.626 ip-10-0-177-37.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-12:13:24:37:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.54809#011validation-merror:0.56205\u001b[0m\n", + "\u001b[34m[2024-02-12 13:24:39.418 ip-10-0-177-37.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-12 13:24:39.421 ip-10-0-177-37.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.52881#011validation-merror:0.54025\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.51631#011validation-merror:0.53010\u001b[0m\n", + "\u001b[34m[3]#011train-merror:0.50496#011validation-merror:0.52125\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.49649#011validation-merror:0.51125\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.49515#011validation-merror:0.51135\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.49075#011validation-merror:0.50570\u001b[0m\n", + "\u001b[34m[7]#011train-merror:0.48436#011validation-merror:0.50025\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.47908#011validation-merror:0.49355\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.47491#011validation-merror:0.48925\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.47348#011validation-merror:0.48725\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.47023#011validation-merror:0.48375\u001b[0m\n", + 
"\u001b[34m[12]#011train-merror:0.46452#011validation-merror:0.47650\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.45632#011validation-merror:0.46575\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.45106#011validation-merror:0.46095\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.44463#011validation-merror:0.45505\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.44115#011validation-merror:0.45115\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.43572#011validation-merror:0.44560\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.43310#011validation-merror:0.44285\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.42675#011validation-merror:0.43620\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.41697#011validation-merror:0.42690\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.40773#011validation-merror:0.41710\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.39490#011validation-merror:0.40325\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.38994#011validation-merror:0.39810\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.38683#011validation-merror:0.39415\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.37924#011validation-merror:0.38710\u001b[0m\n", + "\u001b[34m[26]#011train-merror:0.37102#011validation-merror:0.37915\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.36164#011validation-merror:0.36975\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.35730#011validation-merror:0.36610\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.35083#011validation-merror:0.36000\u001b[0m\n", + "\u001b[34m[30]#011train-merror:0.34436#011validation-merror:0.35325\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.33466#011validation-merror:0.34165\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.32956#011validation-merror:0.33610\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.32551#011validation-merror:0.33195\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.31620#011validation-merror:0.32330\u001b[0m\n", + 
"\u001b[34m[35]#011train-merror:0.31168#011validation-merror:0.31940\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.30877#011validation-merror:0.31530\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.30557#011validation-merror:0.31230\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.29613#011validation-merror:0.30190\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.29147#011validation-merror:0.29620\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.28283#011validation-merror:0.28695\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.27641#011validation-merror:0.28100\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.27238#011validation-merror:0.27690\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.26728#011validation-merror:0.27205\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.26151#011validation-merror:0.26660\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.25514#011validation-merror:0.25815\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.25094#011validation-merror:0.25390\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.24552#011validation-merror:0.24770\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.24063#011validation-merror:0.24275\u001b[0m\n", + "\u001b[34m[49]#011train-merror:0.23564#011validation-merror:0.23870\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.23431#011validation-merror:0.23740\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.22709#011validation-merror:0.23020\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.22091#011validation-merror:0.22380\u001b[0m\n", + "\u001b[34m[53]#011train-merror:0.21611#011validation-merror:0.21880\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.20878#011validation-merror:0.21190\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.20377#011validation-merror:0.20710\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.20087#011validation-merror:0.20430\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.19719#011validation-merror:0.20015\u001b[0m\n", + 
"\u001b[34m[58]#011train-merror:0.19445#011validation-merror:0.19750\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.19131#011validation-merror:0.19420\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.18908#011validation-merror:0.19210\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.18516#011validation-merror:0.18885\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.18132#011validation-merror:0.18475\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.17795#011validation-merror:0.18185\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.17456#011validation-merror:0.17820\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.17232#011validation-merror:0.17655\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.16769#011validation-merror:0.17095\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.16395#011validation-merror:0.16710\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.16091#011validation-merror:0.16495\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.15839#011validation-merror:0.16225\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.15706#011validation-merror:0.16140\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.15382#011validation-merror:0.15725\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.15138#011validation-merror:0.15495\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.14889#011validation-merror:0.15315\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.14601#011validation-merror:0.14925\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.14246#011validation-merror:0.14550\u001b[0m\n", + "\u001b[34m[76]#011train-merror:0.13822#011validation-merror:0.14125\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.13733#011validation-merror:0.14060\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.13628#011validation-merror:0.13935\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.13438#011validation-merror:0.13840\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.13035#011validation-merror:0.13430\u001b[0m\n", + 
"\u001b[34m[81]#011train-merror:0.12919#011validation-merror:0.13325\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.12453#011validation-merror:0.12970\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.12369#011validation-merror:0.12905\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.12163#011validation-merror:0.12690\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.12063#011validation-merror:0.12595\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.11888#011validation-merror:0.12405\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.11849#011validation-merror:0.12330\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.11685#011validation-merror:0.12215\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.11521#011validation-merror:0.12095\u001b[0m\n", + "\u001b[34m[90]#011train-merror:0.11383#011validation-merror:0.11945\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.11163#011validation-merror:0.11725\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.11035#011validation-merror:0.11615\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.10860#011validation-merror:0.11480\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.10619#011validation-merror:0.11245\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.10453#011validation-merror:0.11095\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.10273#011validation-merror:0.10900\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.10186#011validation-merror:0.10835\u001b[0m\n", + "\u001b[34m[98]#011train-merror:0.10068#011validation-merror:0.10740\u001b[0m\n", "\n", - "2024-01-25 14:17:00 Uploading - Uploading generated training model\u001B[34m[90]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[91]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[92]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[93]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - 
"\u001B[34m[94]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[95]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[96]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[97]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[98]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", - "\u001B[34m[99]#011train-merror:0.00000#011validation-merror:0.00010\u001B[0m\n", + "2024-02-12 13:27:29 Uploading - Uploading generated training model\u001b[34m[99]#011train-merror:0.09996#011validation-merror:0.10595\u001b[0m\n", "\n", - "2024-01-25 14:17:16 Completed - Training job completed\n", - "Training seconds: 198\n", - "Billable seconds: 198\n" + "2024-02-12 13:27:40 Completed - Training job completed\n", + "Training seconds: 282\n", + "Billable seconds: 282\n" ] } ], @@ -1293,6 +1742,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8b716cd7", "metadata": {}, @@ -1304,8 +1754,33 @@ }, { "cell_type": "code", - "execution_count": 26, - "id": "78444d49-4ad3-49e4-a579-19b173facb26", + "execution_count": 29, + "id": "ded8301b-2523-4ad7-bba2-30e0ca748253", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-12 13:28:20,985 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"sagemaker\", \n", + " user_project=True,\n", + " parameters={\n", + " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", + " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "fcb620d5-0697-4860-adc3-2696a9e1fa66", "metadata": {}, "outputs": [], "source": [ @@ -1314,8 +1789,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", + "execution_count": 31, + 
"id": "e1b53d0c-baee-45f3-be07-6c02af75a6c8", "metadata": {}, "outputs": [ { @@ -1324,52 +1799,52 @@ "\n", "\n", - "\n", "\n", - "\n", + "\n", "\n", "mlrun-flow\n", - "\n", + "\n", "\n", "\n", "_start\n", - "\n", - "start\n", + "\n", + "start\n", "\n", "\n", "\n", "xgboost-model\n", - "\n", - "xgboost-model\n", + "\n", + "xgboost-model\n", "\n", "\n", "\n", "_start->xgboost-model\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "postprocess\n", - "\n", - "postprocess\n", + "\n", + "postprocess\n", "\n", "\n", "\n", "xgboost-model->postprocess\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 27, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1390,31 +1865,95 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", + "execution_count": 32, + "id": "2963f0b2-5169-4138-aefb-cef8813b0e0e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-12 13:28:21,518 [info] model xgboost-model was loaded\n" + ] + } + ], + "source": [ + "server = serving_function.to_mock_server()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "ffa8dda2-ce3d-459c-8a93-f7ebaee7b7f8", + "metadata": {}, + "outputs": [], + "source": [ + "samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "093d87f6-312f-452b-8e7d-1bd613e189ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-12 13:28:21,593 [info] Invoking function: {'method': 'POST', 'path': 'http://sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//predict'}\n" + ] + } + ], + "source": [ + "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": 
"a70a144b-f9cd-4e2e-b1e3-16163b56835a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1, 11, 3, 1, 3, 3, 5, 1, 1, 3, 3, 17, 7, 2, 13, 9, 7, 3, 10, 1, 7, 1, 10, 10, 6, 3, 5, 3, 3, 1, 12, 9, 12, 6, 1, 10, 10, 10, 10, 1, 3, 3, 1, 3, 9, 1, 10, 3, 4, 3, 3, 10, 1, 3, 10, 3, 3, 17, 3, 6, 3, 3, 3, 3, 1, 3, 1, 1, 6, 3, 10, 3, 5, 1, 10, 3, 10, 10, 10, 10, 14, 1, 12, 3, 7, 1, 3, 12, 3, 10, 5, 3, 3, 1, 1, 6, 1, 10, 10, 3, 3, 1, 3, 3, 6, 3, 3, 3, 1, 10, 3, 3, 1, 5, 10, 1, 3, 1, 3, 10, 3, 3, 11, 12, 3, 3, 5, 1, 3, 3, 1, 3, 1, 10, 3, 1, 3, 3, 18, 13, 3, 1, 1, 10, 3, 1, 3, 1, 10, 6, 3, 10, 10, 3, 3, 3, 6, 1, 3, 10, 10, 1, 3, 17, 1, 3, 3, 3, 3, 17, 3, 3, 7, 3, 6, 10, 3, 3, 3, 10, 3, 6, 3, 3, 10, 11, 10, 10, 1, 10, 10, 5, 12, 5, 13, 3, 10, 3, 3, 5, 1, 9, 1, 3, 9, 3, 10, 10, 10, 9, 3, 3, 3, 10, 1, 2, 1, 1, 6, 3, 1, 6, 6, 7, 10, 10, 3, 3, 6, 1, 9, 3, 3, 3, 3, 10, 3, 6, 10, 1, 3, 17, 1, 0, 10, 3, 9, 17, 2, 1, 10, 3, 3, 3, 10, 6, 3, 3, 3, 3, 3, 14, 1, 3, 6, 10, 1, 3, 3, 1, 3, 10, 3, 3, 10, 7, 10, 10, 10, 18, 5, 13, 3, 10, 3, 10, 3, 3, 3, 10, 1, 5, 3, 1, 3, 1, 3, 3, 5, 3, 1, 3, 3, 1, 10, 18, 6, 3, 3, 10, 3, 3, 1, 7, 3, 1, 3, 6, 1, 9, 3, 3, 3, 10, 10, 3, 17, 3, 1, 10, 1, 10, 15, 1, 3, 3, 1, 1, 14, 11, 9, 10, 10, 6, 1, 4, 1, 3, 17, 3, 3, 3, 1, 6, 14, 3, 3, 9, 1, 10, 6, 5, 3, 13, 3, 1, 3, 3, 6, 3, 17, 13, 17, 12, 1, 1, 10, 13, 5, 3, 3, 3, 3, 10, 13, 18, 3, 3, 3, 3, 10, 10, 3, 8, 1, 3, 9, 3, 10, 3, 1, 10, 10, 1, 3, 3, 3, 10, 1, 1, 3, 7, 1, 10, 6, 3, 3, 3, 3, 3, 3, 9, 10, 3, 10, 9, 3, 3, 17, 10, 3, 7, 1, 1, 3, 1, 17, 7, 3, 3, 3, 3, 9, 3, 3, 1, 10, 10, 3, 3, 1, 7, 10, 10, 10, 12, 6, 3, 6, 16, 10, 3, 1, 18, 8, 10, 6, 10, 17, 6, 1, 10, 1, 3, 10, 1, 3, 3, 10, 3, 10, 3, 3, 5, 7, 6, 3, 6, 5, 3, 3, 3, 3, 2, 10, 12, 9, 1, 3]\n" + ] + } + ], + "source": [ + "print(response['predictions'])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "815fd915-e8ff-455a-a16b-dbf470f8d825", "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ - "> 2024-01-25 14:17:46,696 [info] Starting remote function deploy\n", - "2024-01-25 14:17:46 (info) Deploying function\n", - "2024-01-25 14:17:46 (info) Building\n", - "2024-01-25 14:17:47 (info) Staging files and preparing base images\n", - "2024-01-25 14:17:47 (info) Building processor image\n", - "2024-01-25 14:19:32 (info) Build complete\n", - "2024-01-25 14:19:40 (info) Function deploy complete\n", - "> 2024-01-25 14:19:48,105 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/']}\n" + "> 2024-02-12 13:28:21,673 [info] Starting remote function deploy\n", + "2024-02-12 13:28:22 (info) Deploying function\n", + "2024-02-12 13:28:22 (info) Building\n", + "2024-02-12 13:28:22 (info) Staging files and preparing base images\n", + "2024-02-12 13:28:22 (info) Building processor image\n", + "2024-02-12 13:29:27 (info) Build complete\n", + "2024-02-12 13:29:36 (info) Function deploy complete\n", + "> 2024-02-12 13:29:43,324 [info] Successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-admin-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" ] }, { "data": { "text/plain": [ - "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/', 'name': 'sagemaker-yoni-serving'})" + "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-admin-serving'})" ] }, - "execution_count": 28, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -1425,35 +1964,43 @@ }, { "cell_type": 
"code", - "execution_count": 29, - "id": "c858e3e9-9e43-4148-8015-6047565db456", + "execution_count": 37, + "id": "0bd727dc-7243-4d0c-94b7-91dcaeca3fae", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-02-12 13:29:43,392 [info] Invoking function: {'method': 'POST', 'path': 'http://sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//predict'}\n" + ] + } + ], "source": [ - "samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" + "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" ] }, { "cell_type": "code", - "execution_count": 30, - "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", + "execution_count": 38, + "id": "b6e22e08-9fa3-4fc9-a084-652edf9fdfb7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:19:48,167 [info] invoking function: {'method': 'POST', 'path': 'http://nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080/predict'}\n" + "[1, 1, 11, 3, 1, 3, 3, 5, 1, 1, 3, 3, 17, 7, 2, 13, 9, 7, 3, 10, 1, 7, 1, 10, 10, 6, 3, 5, 3, 3, 1, 12, 9, 12, 6, 1, 10, 10, 10, 10, 1, 3, 3, 1, 3, 9, 1, 10, 3, 4, 3, 3, 10, 1, 3, 10, 3, 3, 17, 3, 6, 3, 3, 3, 3, 1, 3, 1, 1, 6, 3, 10, 3, 5, 1, 10, 3, 10, 10, 10, 10, 14, 1, 12, 3, 7, 1, 3, 12, 3, 10, 5, 3, 3, 1, 1, 6, 1, 10, 10, 3, 3, 1, 3, 3, 6, 3, 3, 3, 1, 10, 3, 3, 1, 5, 10, 1, 3, 1, 3, 10, 3, 3, 11, 12, 3, 3, 5, 1, 3, 3, 1, 3, 1, 10, 3, 1, 3, 3, 18, 13, 3, 1, 1, 10, 3, 1, 3, 1, 10, 6, 3, 10, 10, 3, 3, 3, 6, 1, 3, 10, 10, 1, 3, 17, 1, 3, 3, 3, 3, 17, 3, 3, 7, 3, 6, 10, 3, 3, 3, 10, 3, 6, 3, 3, 10, 11, 10, 10, 1, 10, 10, 5, 12, 5, 13, 3, 10, 3, 3, 5, 1, 9, 1, 3, 9, 3, 10, 10, 10, 9, 3, 3, 3, 10, 1, 2, 1, 1, 6, 3, 1, 6, 6, 7, 10, 10, 3, 3, 6, 1, 9, 3, 3, 3, 3, 10, 3, 6, 10, 1, 3, 17, 1, 0, 10, 3, 9, 17, 2, 1, 10, 3, 3, 3, 10, 6, 3, 3, 3, 3, 3, 14, 1, 3, 6, 10, 1, 3, 3, 1, 3, 10, 3, 3, 10, 7, 10, 10, 10, 
18, 5, 13, 3, 10, 3, 10, 3, 3, 3, 10, 1, 5, 3, 1, 3, 1, 3, 3, 5, 3, 1, 3, 3, 1, 10, 18, 6, 3, 3, 10, 3, 3, 1, 7, 3, 1, 3, 6, 1, 9, 3, 3, 3, 10, 10, 3, 17, 3, 1, 10, 1, 10, 15, 1, 3, 3, 1, 1, 14, 11, 9, 10, 10, 6, 1, 4, 1, 3, 17, 3, 3, 3, 1, 6, 14, 3, 3, 9, 1, 10, 6, 5, 3, 13, 3, 1, 3, 3, 6, 3, 17, 13, 17, 12, 1, 1, 10, 13, 5, 3, 3, 3, 3, 10, 13, 18, 3, 3, 3, 3, 10, 10, 3, 8, 1, 3, 9, 3, 10, 3, 1, 10, 10, 1, 3, 3, 3, 10, 1, 1, 3, 7, 1, 10, 6, 3, 3, 3, 3, 3, 3, 9, 10, 3, 10, 9, 3, 3, 17, 10, 3, 7, 1, 1, 3, 1, 17, 7, 3, 3, 3, 3, 9, 3, 3, 1, 10, 10, 3, 3, 1, 7, 10, 10, 10, 12, 6, 3, 6, 16, 10, 3, 1, 18, 8, 10, 6, 10, 17, 6, 1, 10, 1, 3, 10, 1, 3, 3, 10, 3, 10, 3, 3, 5, 7, 6, 3, 6, 5, 3, 3, 3, 3, 2, 10, 12, 9, 1, 3]\n" ] } ], "source": [ - "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" + "print(response['predictions'])" ] }, { "cell_type": "markdown", - "id": "712f4d35", + "id": "7a93919f-da79-455c-a8c2-eab5deece6d9", "metadata": {}, "source": [ "### 6. Evaluate performance \n", @@ -1463,8 +2010,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "id": "2e863ea7-5804-4637-b677-390c305cabfe", + "execution_count": 39, + "id": "6b6a92c2-09f2-4e4a-9c3a-1c005e231d1b", "metadata": {}, "outputs": [], "source": [ @@ -1472,44 +2019,70 @@ ] }, { - "cell_type": "markdown", - "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", + "cell_type": "code", + "execution_count": 40, + "id": "84c6a5b7", "metadata": {}, + "outputs": [], "source": [ - "Add the evaluation function to our project" + "evaluate_function = project.get_function(\"evaluate\")" ] }, { "cell_type": "code", - "execution_count": 32, - "id": "ca4f7e49", + "execution_count": 41, + "id": "cdb2def1-c771-4757-a581-41c0014f2678", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-12-13-21-36-187/output/model.tar.gz'" + ] + }, + "execution_count": 41, + "metadata": 
{}, + "output_type": "execute_result" + } + ], "source": [ - "evaluate_function = project.get_function(\"evaluate\")" + "xgb.model_data" ] }, { - "cell_type": "markdown", - "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", + "cell_type": "code", + "execution_count": 42, + "id": "fe92026c-c9e3-4e06-8b75-894db62f8f6a", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv'" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Run the evaluation job" + "s3_data" ] }, { "cell_type": "code", - "execution_count": 33, - "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", + "execution_count": 43, + "id": "62b14992-24db-4be2-bcb3-dbd115fa7544", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:19:48,410 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': 'cac9cd3c55ba40d58fbe1156d4861e79', 'db': 'http://mlrun-api:8080'}\n", - "> 2024-01-25 14:19:48,708 [info] Job is running in the background, pod: evaluate-evaluate-5rrtk\n", - "[14:19:52] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", + "> 2024-02-12 13:29:43,661 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '676e68621dd34bcd89b138f74954a4d5', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", + "> 2024-02-12 13:29:44,057 [info] Job is running in the background, pod: evaluate-evaluate-bdkpj\n", + "[13:29:48] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", "configuration generated by an older version of XGBoost, please export the model by calling\n", "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", "\n", @@ -1517,9 +2090,23 @@ "\n", "for more details about differences between saving model and serializing.\n", "\n", - "> 2024-01-25 14:19:53,802 [info] To track results use the CLI: {'info_cmd': 'mlrun get run cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni', 'logs_cmd': 'mlrun logs cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni'}\n", - "> 2024-01-25 14:19:53,802 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/sagemaker-yoni/jobs/monitor/cac9cd3c55ba40d58fbe1156d4861e79/overview'}\n", - "> 2024-01-25 14:19:53,803 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + " transaction_category receiver_id ... 17_dist 18_dist\n", + "0 3 4.630518e+15 ... 714.212199 138.277151\n", + "1 1 4.558838e+15 ... 761.252199 185.317151\n", + "2 8 4.542379e+15 ... 177.982199 397.952849\n", + "3 3 4.892899e+15 ... 644.632199 68.697151\n", + "4 1 4.282494e+15 ... 688.222199 112.287151\n", + "... ... ... ... ... ...\n", + "9995 1 4.979209e+15 ... 744.162199 168.227151\n", + "9996 5 4.481907e+15 ... 692.862199 116.927151\n", + "9997 10 4.210580e+15 ... 649.782199 73.847151\n", + "9998 1 4.542105e+15 ... 727.272199 151.337151\n", + "9999 3 4.486975e+15 ... 766.152199 190.217151\n", + "\n", + "[10000 rows x 29 columns]\n", + "> 2024-02-12 13:29:49,267 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 676e68621dd34bcd89b138f74954a4d5 -p sagemaker-admin', 'logs_cmd': 'mlrun logs 676e68621dd34bcd89b138f74954a4d5 -p sagemaker-admin'}\n", + "> 2024-02-12 13:29:49,267 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-admin/jobs/monitor/676e68621dd34bcd89b138f74954a4d5/overview'}\n", + "> 2024-02-12 13:29:49,267 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" ] }, { @@ -1692,27 +2279,27 @@ " \n", " \n", "
\n", - " sagemaker-yoni\n", - "
\n", + " sagemaker-admin\n", + " \n", " 0\n", - " Jan 25 14:19:51\n", + " Feb 12 13:29:47\n", " completed\n", " evaluate-evaluate\n", - "
v3io_user=yoni
kind=job
owner=yoni
mlrun/client_version=1.6.0-rc21
mlrun/client_python_version=3.9.16
host=evaluate-evaluate-5rrtk
\n", + "
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc26
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-bdkpj
\n", " \n", - "
model_path=s3://sagemaker-us-east-1-934638699319/payment-classification/output/sagemaker-xgboost-2024-01-25-14-12-01-149/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-1-934638699319/payment-classification/test/test.csv
label_column=transaction_category
factorize_key={'Uncategorized': '0', 'Entertainment': '1', 'Education': '2', 'Shopping': '3', 'Personal Care': '4', 'Health and Fitness': '5', 'Food and Dining': '6', 'Gifts and Donations': '7', 'Investments': '8', 'Bills and Utilities': '9', 'Auto and Transport': '10', 'Travel': '11', 'Fees and Charges': '12', 'Business Services': '13', 'Personal Services': '14', 'Taxes': '15', 'Gambling': '16', 'Home': '17', 'Pension and insurances': '18'}
\n", + "
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-12-13-21-36-187/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category
factorize_key={'Uncategorized': '0', 'Entertainment': '1', 'Education': '2', 'Shopping': '3', 'Personal Care': '4', 'Health and Fitness': '5', 'Food and Dining': '6', 'Gifts and Donations': '7', 'Investments': '8', 'Bills and Utilities': '9', 'Auto and Transport': '10', 'Travel': '11', 'Fees and Charges': '12', 'Business Services': '13', 'Personal Services': '14', 'Taxes': '15', 'Gambling': '16', 'Home': '17', 'Pension and insurances': '18'}
\n", " \n", - "
classification_report
\n", + "
classification_report
\n", " \n", " \n", "\n", "\n", - "
\n", + "
\n", "
\n", - " Title\n", - " ×\n", + " Title\n", + " ×\n", "
\n", - " \n", + " \n", "
\n", "
\n" ], @@ -1733,7 +2320,7 @@ { "data": { "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" + " > to track results use the .show() or .logs() methods or click here to open in UI" ], "text/plain": [ "" @@ -1746,7 +2333,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-25 14:19:59,831 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + "> 2024-02-12 13:29:55,177 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" ] } ], @@ -1763,18 +2350,10 @@ " returns=[\"classification_report: dataset\"])" ] }, - { - "cell_type": "markdown", - "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", - "metadata": {}, - "source": [ - "See the evaluation result" - ] - }, { "cell_type": "code", - "execution_count": 34, - "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", + "execution_count": 44, + "id": "a40da196-0ee8-4ac6-9f7b-bea35ab181c9", "metadata": {}, "outputs": [ { @@ -1808,188 +2387,188 @@ " \n", " Uncategorized\n", " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 51.0000\n", + " 0.921569\n", + " 0.959184\n", + " 51.000\n", " \n", " \n", " Entertainment\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 1486.0000\n", + " 0.795745\n", + " 0.880888\n", + " 0.836155\n", + " 1486.000\n", " \n", " \n", " Education\n", " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 80.0000\n", + " 0.937500\n", + " 0.967742\n", + " 80.000\n", " \n", " \n", " Shopping\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 3441.0000\n", + " 0.848892\n", + " 0.935484\n", + " 0.890087\n", + " 3441.000\n", " \n", " \n", " Personal Care\n", " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 132.0000\n", + " 0.984848\n", + " 0.992366\n", + " 132.000\n", " \n", " \n", " Health and Fitness\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 443.0000\n", + " 0.987864\n", + " 0.918736\n", + " 0.952047\n", + " 443.000\n", " \n", " \n", " Food and 
Dining\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 918.0000\n", + " 0.989542\n", + " 0.824619\n", + " 0.899584\n", + " 918.000\n", " \n", " \n", " Gifts and Donations\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 275.0000\n", + " 0.996169\n", + " 0.945455\n", + " 0.970149\n", + " 275.000\n", " \n", " \n", " Investments\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 88.0000\n", + " 0.988235\n", + " 0.954545\n", + " 0.971098\n", + " 88.000\n", " \n", " \n", " Bills and Utilities\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 332.0000\n", + " 0.996979\n", + " 0.993976\n", + " 0.995475\n", + " 332.000\n", " \n", " \n", " Auto and Transport\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 1967.0000\n", + " 0.938012\n", + " 0.815455\n", + " 0.872450\n", + " 1967.000\n", " \n", " \n", " Travel\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 120.0000\n", + " 0.952381\n", + " 0.833333\n", + " 0.888889\n", + " 120.000\n", " \n", " \n", " Fees and Charges\n", " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 106.0000\n", + " 0.952830\n", + " 0.975845\n", + " 106.000\n", " \n", " \n", " Business Services\n", " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 146.0000\n", + " 0.993151\n", + " 0.996564\n", + " 146.000\n", " \n", " \n", " Personal Services\n", " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 75.0000\n", + " 0.960000\n", + " 0.979592\n", + " 75.000\n", " \n", " \n", " Taxes\n", " 1.000000\n", - " 0.978723\n", - " 0.989247\n", - " 47.0000\n", + " 0.936170\n", + " 0.967033\n", + " 47.000\n", " \n", " \n", " Gambling\n", - " 0.937500\n", " 1.000000\n", - " 0.967742\n", - " 15.0000\n", + " 1.000000\n", + " 1.000000\n", + " 15.000\n", " \n", " \n", " Home\n", - " 1.000000\n", - " 1.000000\n", - " 1.000000\n", - " 168.0000\n", + " 0.980519\n", + " 0.898810\n", + " 0.937888\n", + " 168.000\n", " \n", " \n", " Pension and insurances\n", + " 0.990991\n", " 1.000000\n", - " 1.000000\n", - " 
1.000000\n", - " 110.0000\n", + " 0.995475\n", + " 110.000\n", " \n", " \n", " accuracy\n", - " 0.999900\n", - " 0.999900\n", - " 0.999900\n", - " 0.9999\n", + " 0.896000\n", + " 0.896000\n", + " 0.896000\n", + " 0.896\n", " \n", " \n", " macro avg\n", - " 0.996711\n", - " 0.998880\n", - " 0.997736\n", - " 10000.0000\n", + " 0.971859\n", + " 0.930914\n", + " 0.949875\n", + " 10000.000\n", " \n", " \n", " weighted avg\n", - " 0.999906\n", - " 0.999900\n", - " 0.999901\n", - " 10000.0000\n", + " 0.902654\n", + " 0.896000\n", + " 0.896567\n", + " 10000.000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " precision recall f1-score support\n", - "Uncategorized 1.000000 1.000000 1.000000 51.0000\n", - "Entertainment 1.000000 1.000000 1.000000 1486.0000\n", - "Education 1.000000 1.000000 1.000000 80.0000\n", - "Shopping 1.000000 1.000000 1.000000 3441.0000\n", - "Personal Care 1.000000 1.000000 1.000000 132.0000\n", - "Health and Fitness 1.000000 1.000000 1.000000 443.0000\n", - "Food and Dining 1.000000 1.000000 1.000000 918.0000\n", - "Gifts and Donations 1.000000 1.000000 1.000000 275.0000\n", - "Investments 1.000000 1.000000 1.000000 88.0000\n", - "Bills and Utilities 1.000000 1.000000 1.000000 332.0000\n", - "Auto and Transport 1.000000 1.000000 1.000000 1967.0000\n", - "Travel 1.000000 1.000000 1.000000 120.0000\n", - "Fees and Charges 1.000000 1.000000 1.000000 106.0000\n", - "Business Services 1.000000 1.000000 1.000000 146.0000\n", - "Personal Services 1.000000 1.000000 1.000000 75.0000\n", - "Taxes 1.000000 0.978723 0.989247 47.0000\n", - "Gambling 0.937500 1.000000 0.967742 15.0000\n", - "Home 1.000000 1.000000 1.000000 168.0000\n", - "Pension and insurances 1.000000 1.000000 1.000000 110.0000\n", - "accuracy 0.999900 0.999900 0.999900 0.9999\n", - "macro avg 0.996711 0.998880 0.997736 10000.0000\n", - "weighted avg 0.999906 0.999900 0.999901 10000.0000" + " precision recall f1-score support\n", + "Uncategorized 1.000000 0.921569 0.959184 51.000\n", + 
"Entertainment 0.795745 0.880888 0.836155 1486.000\n", + "Education 1.000000 0.937500 0.967742 80.000\n", + "Shopping 0.848892 0.935484 0.890087 3441.000\n", + "Personal Care 1.000000 0.984848 0.992366 132.000\n", + "Health and Fitness 0.987864 0.918736 0.952047 443.000\n", + "Food and Dining 0.989542 0.824619 0.899584 918.000\n", + "Gifts and Donations 0.996169 0.945455 0.970149 275.000\n", + "Investments 0.988235 0.954545 0.971098 88.000\n", + "Bills and Utilities 0.996979 0.993976 0.995475 332.000\n", + "Auto and Transport 0.938012 0.815455 0.872450 1967.000\n", + "Travel 0.952381 0.833333 0.888889 120.000\n", + "Fees and Charges 1.000000 0.952830 0.975845 106.000\n", + "Business Services 1.000000 0.993151 0.996564 146.000\n", + "Personal Services 1.000000 0.960000 0.979592 75.000\n", + "Taxes 1.000000 0.936170 0.967033 47.000\n", + "Gambling 1.000000 1.000000 1.000000 15.000\n", + "Home 0.980519 0.898810 0.937888 168.000\n", + "Pension and insurances 0.990991 1.000000 0.995475 110.000\n", + "accuracy 0.896000 0.896000 0.896000 0.896\n", + "macro avg 0.971859 0.930914 0.949875 10000.000\n", + "weighted avg 0.902654 0.896000 0.896567 10000.000" ] }, - "execution_count": 34, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1999,15 +2578,10 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "98d0b67e", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, "source": [ "You should see results similar to this:\n", "\n", @@ -2041,6 +2615,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "49fdc82d", "metadata": {}, @@ -2052,25 +2627,20 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 47, "id": "f79b1164", "metadata": {}, "outputs": [], "source": [ - "#feature_group.delete()\n", + "feature_group.delete()\n", "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" ] }, { + "attachments": {}, "cell_type": "markdown", "id": 
"e04b6fa6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, + "metadata": {}, "source": [ "## Notebook CI Test Results\n", "\n", @@ -2111,9 +2681,9 @@ "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "mlrun-base", + "display_name": "smdemo", "language": "python", - "name": "conda-env-mlrun-base-py" + "name": "smdemo" }, "language_info": { "codemirror_mode": { @@ -2125,7 +2695,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/financial_payment_classification_v3.ipynb b/financial_payment_classification_v3.ipynb deleted file mode 100644 index dd598ad..0000000 --- a/financial_payment_classification_v3.ipynb +++ /dev/null @@ -1,3564 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "01b5c703", - "metadata": {}, - "source": [ - "# SageMaker Payment Classification \n" - ] - }, - { - "cell_type": "markdown", - "id": "6498f087", - "metadata": {}, - "source": [ - "---\n", - "\n", - "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", - "\n", - "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "id": "c2e49281", - "metadata": {}, - "source": [ - "\n", - "## Background \n", - "\n", - "This notebook demonstrates how you can train and deploy a machine learning model to classify payment transactions. Enriching financial transactions with the category of the transaction. This can be used as an intermediate step in fraud detection, personalization or anomaly detection. 
As well as a method to provide end users (e.g. customers at a bank) with an overview of their spending habits. Amazon SageMaker can be used to train and deploy a XGBoost model, as well as the required underlying infrastructure. For this notebook a generated dataset is used where a payment consists of mostly an amount, sender, receiver and timestamp.\n", - "\n", - "\n", - "## Notebook overview \n", - "\n", - "This notebook consists of seven parts. First, we import and configure the required libraries. After that we prepare the data used in this example and create the feature store. With the newly created features we create a XGBoost model. An endpoint is created to host this model. We evaluate the performance of the model and end by cleaning up the used resources.\n", - "\n", - "## Dataset \n", - "\n", - "For this notebook we use a synthetic dataset. This dataset has the following features \n", - "\n", - "* __transaction_category__: The category of the transaction, this is one of the next 19 options.\n", - "\n", - " 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances'\n", - "\n", - "\n", - "* __receiver_id__: an identifier for the receiving party. The identifier consist of 16 numbers.\n", - "* __sender_id__: an identifier for the sending party. The identifier consist of 16 numbers.\n", - "* __amount__: the amount which is transferred.\n", - "* __timestamp__: the timestamp of the transaction in YYYY-MM-DD HH:MM:SS format.\n", - "\n", - "\n", - "### 1. 
Setup \n", - "\n", - "Before we start we need to update the sagemaker library" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fff19d6b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", - "# !{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", - "# !{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:21:51,304 [info] Identified pre-initialized git repo, using it: {'url': 'git://github.com/aviaIguazio/demo-sagemaker.git#refs/heads/development'}\n", - "> 2024-02-11 16:22:06,708 [info] Created and saved project: {'name': 'sagemaker-v3-admin', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}\n", - "> 2024-02-11 16:22:07,592 [info] Project created successfully: {'project_name': 'sagemaker-v3', 'stored_in_db': True}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v3\", \n", - " user_project=True,\n", - " parameters={\n", - " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", - " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1b17a94d", - "metadata": {}, - "source": [ - "Now that we have the latest version we can import the libraries that 
we'll use in this notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "42c5d6d0", - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "import io\n", - "import sagemaker\n", - "import time\n", - "import os\n", - "from sklearn.metrics import classification_report\n", - "import pandas as pd\n", - "import numpy as np\n", - "from datetime import datetime, timedelta" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", - "metadata": {}, - "outputs": [], - "source": [ - "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", - "metadata": {}, - "outputs": [], - "source": [ - "sess = sagemaker.Session()\n", - "write_bucket = sess.default_bucket()\n", - "write_prefix = \"sagemaker-app-lab\"" - ] - }, - { - "cell_type": "markdown", - "id": "3af7c33d", - "metadata": {}, - "source": [ - "Let's set the session variables to ensure that SageMaker is configured correctly." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "c0e4db17", - "metadata": {}, - "outputs": [], - "source": [ - "region = sagemaker.Session().boto_region_name\n", - "sm_client = boto3.client(\"sagemaker\")\n", - "boto_session = boto3.Session(region_name=region)\n", - "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", - "role = sagemaker_role\n", - "bucket_prefix = \"payment-classification\"\n", - "s3_bucket = sagemaker_session.default_bucket()" - ] - }, - { - "cell_type": "markdown", - "id": "4fe6a975", - "metadata": {}, - "source": [ - "We define the factorize key which is used to map the '__transaction_category__' to numeric values" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "43946b9f", - "metadata": {}, - "outputs": [], - "source": [ - "factorize_key = {\n", - " \"Uncategorized\": 0,\n", - " \"Entertainment\": 1,\n", - " \"Education\": 2,\n", - " \"Shopping\": 3,\n", - " \"Personal Care\": 4,\n", - " \"Health and Fitness\": 5,\n", - " \"Food and Dining\": 6,\n", - " \"Gifts and Donations\": 7,\n", - " \"Investments\": 8,\n", - " \"Bills and Utilities\": 9,\n", - " \"Auto and Transport\": 10,\n", - " \"Travel\": 11,\n", - " \"Fees and Charges\": 12,\n", - " \"Business Services\": 13,\n", - " \"Personal Services\": 14,\n", - " \"Taxes\": 15,\n", - " \"Gambling\": 16,\n", - " \"Home\": 17,\n", - " \"Pension and insurances\": 18,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "5e3dc3c4", - "metadata": {}, - "source": [ - "### 2. 
Data preparation \n", - "\n", - "We ingest the simulated data from the public SageMaker S3 training database:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "5ff0d280", - "metadata": {}, - "outputs": [], - "source": [ - "s3 = boto3.client(\"s3\")\n", - "s3.download_file(\n", - " f\"sagemaker-example-files-prod-{region}\",\n", - " \"datasets/tabular/synthetic_financial/financial_transactions_mini.csv\",\n", - " \"financial_transactions_mini.csv\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "08578d93", - "metadata": {}, - "source": [ - "Let's start by loading the dataset from our csv file into a Pandas dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a477abd7", - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv(\n", - " \"financial_transactions_mini.csv\",\n", - " parse_dates=[\"timestamp\"],\n", - " infer_datetime_format=True,\n", - " dtype={\"transaction_category\": \"string\"},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cf6be447", - "metadata": {}, - "source": [ - "The dataframe looks as follows:\n", - "\n", - "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", - "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", - "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", - "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", - "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", - "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", - "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", - "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", - "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 
13:23:44 |\n", - "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", - "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", - "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8c15f00d-8f89-41ec-aa22-f23fc394d1b4", - "metadata": {}, - "outputs": [], - "source": [ - "from utils import update_timestamps\n", - "data=update_timestamps(data)" - ] - }, - { - "cell_type": "markdown", - "id": "b5492919", - "metadata": {}, - "source": [ - "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", - "metadata": {}, - "outputs": [], - "source": [ - "data['transaction_id']= data.reset_index().index " - ] - }, - { - "cell_type": "markdown", - "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", - "metadata": { - "tags": [] - }, - "source": [ - "### 3. Create feature store \n", - "\n", - "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 
" - ] - }, - { - "cell_type": "markdown", - "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", - "metadata": {}, - "source": [ - "#### feature-group-payment-classification" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", - "metadata": {}, - "outputs": [], - "source": [ - "# move category to the first column to match sagemaker label train convention\"\n", - "def pop_and_move_to_start(d, key):\n", - " # Pop the item if it exists, otherwise return None\n", - " value = d.pop(key, None)\n", - " if value is not None:\n", - " # Move the popped item to the start\n", - " d = {key: value, **d}\n", - " return d\n", - "\n", - "def calculate_category_distance(event): \n", - " column_name ='transaction_category_mapped'\n", - " event = pop_and_move_to_start(event,column_name)\n", - " category = event[column_name]\n", - " event['distance'] = abs(event['amount']-event['amount_avg_1d'])\n", - " #event['distance'] = abs(event['amount']/2)\n", - " \n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "4101c303-2da3-431b-9375-9fa1747070af", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "DateExtractor\n", - "\n", - "DateExtractor\n", - "\n", - "\n", - "\n", - "_start->DateExtractor\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "MapValues\n", - "\n", - "MapValues\n", - "\n", - "\n", - "\n", - "DateExtractor->MapValues\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "MapValues->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "calculate_category_distance\n", - "\n", - "calculate_category_distance\n", - "\n", - "\n", - "\n", - "Aggregates->calculate_category_distance\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "DropFeatures\n", - "\n", - 
"DropFeatures\n", - "\n", - "\n", - "\n", - "calculate_category_distance->DropFeatures\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet/parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "DropFeatures->parquet/parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql/nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "DropFeatures->nosql/nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fstore\n", - "from mlrun.feature_store.steps import OneHotEncoder, MapValues, DateExtractor, DropFeatures\n", - "\n", - "# creating feature set\n", - "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", - " entities=[fstore.Entity(\"transaction_category\")],\n", - " description=\"transactions feature set\")\n", - "# setting up the graph\n", - "extended_transactions_set.graph \\\n", - " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp')) \\\n", - " .to(MapValues({'transaction_category' : factorize_key}, with_original_features=True)) \\\n", - "\n", - "extended_transactions_set.add_aggregation(name='amount',column='amount',operations=['avg'],windows=['1d'],period='1h')\n", - "\n", - "extended_transactions_set.graph \\\n", - " .to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates') \\\n", - " .to(DropFeatures(features=['timestamp']))\n", - "\n", - "\n", - "extended_transactions_set.set_targets()\n", - "\n", - "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "53eb2151-447a-4eb0-be7f-a07f1cbea32d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamounttimestamptransaction_id
3Uncategorized45185519044999194457298962882528879.782024-02-07 07:55:08.1589333
7Uncategorized47579519156690804655296518888015801.222023-12-10 10:27:30.1589337
11Uncategorized45185519044999194910949333064003423.312024-01-17 03:59:02.15893311
15Uncategorized45185519044999194415760195692405382.732023-11-20 10:51:32.15893315
19Uncategorized40980889806929744412940106031926111.772023-11-06 07:33:38.15893319
.....................
99979Pension and insurances41796068600888494359198069543354302.102024-01-09 23:29:38.15893399979
99983Pension and insurances47515386207333054021524999937895115.892024-02-05 09:43:23.15893399983
99987Pension and insurances44050083552203244165276502284291207.082023-12-14 17:17:11.15893399987
99991Pension and insurances40921157888775434328901131757235355.582024-02-20 10:35:48.15893399991
99995Pension and insurances42620471944990064017367486513464204.262023-12-15 15:05:56.15893399995
\n", - "

24999 rows × 6 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "7 Uncategorized 4757951915669080 4655296518888015 801.22 \n", - "11 Uncategorized 4518551904499919 4910949333064003 423.31 \n", - "15 Uncategorized 4518551904499919 4415760195692405 382.73 \n", - "19 Uncategorized 4098088980692974 4412940106031926 111.77 \n", - "... ... ... ... ... \n", - "99979 Pension and insurances 4179606860088849 4359198069543354 302.10 \n", - "99983 Pension and insurances 4751538620733305 4021524999937895 115.89 \n", - "99987 Pension and insurances 4405008355220324 4165276502284291 207.08 \n", - "99991 Pension and insurances 4092115788877543 4328901131757235 355.58 \n", - "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "\n", - " timestamp transaction_id \n", - "3 2024-02-07 07:55:08.158933 3 \n", - "7 2023-12-10 10:27:30.158933 7 \n", - "11 2024-01-17 03:59:02.158933 11 \n", - "15 2023-11-20 10:51:32.158933 15 \n", - "19 2023-11-06 07:33:38.158933 19 \n", - "... ... ... 
\n", - "99979 2024-01-09 23:29:38.158933 99979 \n", - "99983 2024-02-05 09:43:23.158933 99983 \n", - "99987 2023-12-14 17:17:11.158933 99987 \n", - "99991 2024-02-20 10:35:48.158933 99991 \n", - "99995 2023-12-15 15:05:56.158933 99995 \n", - "\n", - "[24999 rows x 6 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Keeping every second row\n", - "df_kept = data.iloc[::2]\n", - "\n", - "# Or, to explicitly remove every second row (the opposite selection)\n", - "df_removed = data.drop(data.index[::2])\n", - "\n", - "\n", - "# Keeping every second row\n", - "df_kept = df_removed.iloc[::2]\n", - "\n", - "# Or, to explicitly remove every second row (the opposite selection)\n", - "df_removed_v2 = df_removed.drop(df_removed.index[::2])\n", - "\n", - "df_removed_v2" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "06c03ea5-8394-44ff-b81d-755e1c244269", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamounttimestamptransaction_id
3Uncategorized45185519044999194457298962882528879.782024-04-05 09:12:27.2579453
7Uncategorized47579519156690804655296518888015801.222024-02-06 11:44:49.2579457
11Uncategorized45185519044999194910949333064003423.312024-03-15 05:16:21.25794511
15Uncategorized45185519044999194415760195692405382.732024-01-17 12:08:51.25794515
19Uncategorized40980889806929744412940106031926111.772024-01-03 08:50:57.25794519
.....................
99979Pension and insurances41796068600888494359198069543354302.102024-03-08 00:46:57.25794599979
99983Pension and insurances47515386207333054021524999937895115.892024-04-03 11:00:42.25794599983
99987Pension and insurances44050083552203244165276502284291207.082024-02-10 18:34:30.25794599987
99991Pension and insurances40921157888775434328901131757235355.582024-04-18 11:53:07.25794599991
99995Pension and insurances42620471944990064017367486513464204.262024-02-11 16:23:15.25794599995
\n", - "

24999 rows × 6 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "7 Uncategorized 4757951915669080 4655296518888015 801.22 \n", - "11 Uncategorized 4518551904499919 4910949333064003 423.31 \n", - "15 Uncategorized 4518551904499919 4415760195692405 382.73 \n", - "19 Uncategorized 4098088980692974 4412940106031926 111.77 \n", - "... ... ... ... ... \n", - "99979 Pension and insurances 4179606860088849 4359198069543354 302.10 \n", - "99983 Pension and insurances 4751538620733305 4021524999937895 115.89 \n", - "99987 Pension and insurances 4405008355220324 4165276502284291 207.08 \n", - "99991 Pension and insurances 4092115788877543 4328901131757235 355.58 \n", - "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "\n", - " timestamp transaction_id \n", - "3 2024-04-05 09:12:27.257945 3 \n", - "7 2024-02-06 11:44:49.257945 7 \n", - "11 2024-03-15 05:16:21.257945 11 \n", - "15 2024-01-17 12:08:51.257945 15 \n", - "19 2024-01-03 08:50:57.257945 19 \n", - "... ... ... \n", - "99979 2024-03-08 00:46:57.257945 99979 \n", - "99983 2024-04-03 11:00:42.257945 99983 \n", - "99987 2024-02-10 18:34:30.257945 99987 \n", - "99991 2024-04-18 11:53:07.257945 99991 \n", - "99995 2024-02-11 16:23:15.257945 99995 \n", - "\n", - "[24999 rows x 6 columns]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from utils import update_timestamps\n", - "df_removed_v2=update_timestamps(df_removed_v2)\n", - "df_removed_v2" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_category
Uncategorized0879.78000045185519044999194457298962882528879.783202445912270.000000
Uncategorized0840.50000047579519156690804655296518888015801.22720242611444939.280000
Uncategorized0701.43666745185519044999194910949333064003423.3111202431551621278.126667
Uncategorized0621.76000045185519044999194415760195692405382.7315202411712851239.030000
Uncategorized0519.76200040980889806929744412940106031926111.771920241385057407.992000
..........................................
Pension and insurances18211.45363641796068600888494359198069543354302.10999792024380465790.646364
Pension and insurances18211.10739147515386207333054021524999937895115.89999832024431104295.217391
Pension and insurances18211.09285244050083552203244165276502284291207.089998720242101834304.012852
Pension and insurances18211.61259040921157888775434328901131757235355.5899991202441811537143.967410
Pension and insurances18211.58623742620471944990064017367486513464204.269999520242111623157.326237
\n", - "

24999 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category_mapped amount_avg_1d \\\n", - "transaction_category \n", - "Uncategorized 0 879.780000 \n", - "Uncategorized 0 840.500000 \n", - "Uncategorized 0 701.436667 \n", - "Uncategorized 0 621.760000 \n", - "Uncategorized 0 519.762000 \n", - "... ... ... \n", - "Pension and insurances 18 211.453636 \n", - "Pension and insurances 18 211.107391 \n", - "Pension and insurances 18 211.092852 \n", - "Pension and insurances 18 211.612590 \n", - "Pension and insurances 18 211.586237 \n", - "\n", - " receiver_id sender_id amount \\\n", - "transaction_category \n", - "Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "Uncategorized 4757951915669080 4655296518888015 801.22 \n", - "Uncategorized 4518551904499919 4910949333064003 423.31 \n", - "Uncategorized 4518551904499919 4415760195692405 382.73 \n", - "Uncategorized 4098088980692974 4412940106031926 111.77 \n", - "... ... ... ... \n", - "Pension and insurances 4179606860088849 4359198069543354 302.10 \n", - "Pension and insurances 4751538620733305 4021524999937895 115.89 \n", - "Pension and insurances 4405008355220324 4165276502284291 207.08 \n", - "Pension and insurances 4092115788877543 4328901131757235 355.58 \n", - "Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "\n", - " transaction_id timestamp_year timestamp_month \\\n", - "transaction_category \n", - "Uncategorized 3 2024 4 \n", - "Uncategorized 7 2024 2 \n", - "Uncategorized 11 2024 3 \n", - "Uncategorized 15 2024 1 \n", - "Uncategorized 19 2024 1 \n", - "... ... ... ... 
\n", - "Pension and insurances 99979 2024 3 \n", - "Pension and insurances 99983 2024 4 \n", - "Pension and insurances 99987 2024 2 \n", - "Pension and insurances 99991 2024 4 \n", - "Pension and insurances 99995 2024 2 \n", - "\n", - " timestamp_day timestamp_hour timestamp_minute \\\n", - "transaction_category \n", - "Uncategorized 5 9 12 \n", - "Uncategorized 6 11 44 \n", - "Uncategorized 15 5 16 \n", - "Uncategorized 17 12 8 \n", - "Uncategorized 3 8 50 \n", - "... ... ... ... \n", - "Pension and insurances 8 0 46 \n", - "Pension and insurances 3 11 0 \n", - "Pension and insurances 10 18 34 \n", - "Pension and insurances 18 11 53 \n", - "Pension and insurances 11 16 23 \n", - "\n", - " timestamp_second distance \n", - "transaction_category \n", - "Uncategorized 27 0.000000 \n", - "Uncategorized 49 39.280000 \n", - "Uncategorized 21 278.126667 \n", - "Uncategorized 51 239.030000 \n", - "Uncategorized 57 407.992000 \n", - "... ... ... \n", - "Pension and insurances 57 90.646364 \n", - "Pension and insurances 42 95.217391 \n", - "Pension and insurances 30 4.012852 \n", - "Pension and insurances 7 143.967410 \n", - "Pension and insurances 15 7.326237 \n", - "\n", - "[24999 rows x 13 columns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ingested_data = extended_transactions_set.ingest(df_removed_v2, overwrite=True)\n", - "ingested_data" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "6595564d-91a2-49c0-93e1-dc8ebb28467d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_category
Uncategorized0879.78000045185519044999194457298962882528879.783202445912270.000000
Uncategorized0840.50000047579519156690804655296518888015801.22720242611444939.280000
Uncategorized0701.43666745185519044999194910949333064003423.3111202431551621278.126667
Uncategorized0621.76000045185519044999194415760195692405382.7315202411712851239.030000
Uncategorized0519.76200040980889806929744412940106031926111.771920241385057407.992000
..........................................
Pension and insurances18211.45363641796068600888494359198069543354302.10999792024380465790.646364
Pension and insurances18211.10739147515386207333054021524999937895115.89999832024431104295.217391
Pension and insurances18211.09285244050083552203244165276502284291207.089998720242101834304.012852
Pension and insurances18211.61259040921157888775434328901131757235355.5899991202441811537143.967410
Pension and insurances18211.58623742620471944990064017367486513464204.269999520242111623157.326237
\n", - "

24999 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category_mapped amount_avg_1d \\\n", - "transaction_category \n", - "Uncategorized 0 879.780000 \n", - "Uncategorized 0 840.500000 \n", - "Uncategorized 0 701.436667 \n", - "Uncategorized 0 621.760000 \n", - "Uncategorized 0 519.762000 \n", - "... ... ... \n", - "Pension and insurances 18 211.453636 \n", - "Pension and insurances 18 211.107391 \n", - "Pension and insurances 18 211.092852 \n", - "Pension and insurances 18 211.612590 \n", - "Pension and insurances 18 211.586237 \n", - "\n", - " receiver_id sender_id amount \\\n", - "transaction_category \n", - "Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "Uncategorized 4757951915669080 4655296518888015 801.22 \n", - "Uncategorized 4518551904499919 4910949333064003 423.31 \n", - "Uncategorized 4518551904499919 4415760195692405 382.73 \n", - "Uncategorized 4098088980692974 4412940106031926 111.77 \n", - "... ... ... ... \n", - "Pension and insurances 4179606860088849 4359198069543354 302.10 \n", - "Pension and insurances 4751538620733305 4021524999937895 115.89 \n", - "Pension and insurances 4405008355220324 4165276502284291 207.08 \n", - "Pension and insurances 4092115788877543 4328901131757235 355.58 \n", - "Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "\n", - " transaction_id timestamp_year timestamp_month \\\n", - "transaction_category \n", - "Uncategorized 3 2024 4 \n", - "Uncategorized 7 2024 2 \n", - "Uncategorized 11 2024 3 \n", - "Uncategorized 15 2024 1 \n", - "Uncategorized 19 2024 1 \n", - "... ... ... ... 
\n", - "Pension and insurances 99979 2024 3 \n", - "Pension and insurances 99983 2024 4 \n", - "Pension and insurances 99987 2024 2 \n", - "Pension and insurances 99991 2024 4 \n", - "Pension and insurances 99995 2024 2 \n", - "\n", - " timestamp_day timestamp_hour timestamp_minute \\\n", - "transaction_category \n", - "Uncategorized 5 9 12 \n", - "Uncategorized 6 11 44 \n", - "Uncategorized 15 5 16 \n", - "Uncategorized 17 12 8 \n", - "Uncategorized 3 8 50 \n", - "... ... ... ... \n", - "Pension and insurances 8 0 46 \n", - "Pension and insurances 3 11 0 \n", - "Pension and insurances 10 18 34 \n", - "Pension and insurances 18 11 53 \n", - "Pension and insurances 11 16 23 \n", - "\n", - " timestamp_second distance \n", - "transaction_category \n", - "Uncategorized 27 0.000000 \n", - "Uncategorized 49 39.280000 \n", - "Uncategorized 21 278.126667 \n", - "Uncategorized 51 239.030000 \n", - "Uncategorized 57 407.992000 \n", - "... ... ... \n", - "Pension and insurances 57 90.646364 \n", - "Pension and insurances 42 95.217391 \n", - "Pension and insurances 30 4.012852 \n", - "Pension and insurances 7 143.967410 \n", - "Pension and insurances 15 7.326237 \n", - "\n", - "[24999 rows x 13 columns]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#data = ingested_data.reset_index(drop=True)\n", - "data = ingested_data\n", - "#data = data[['transaction_category'] + [col for col in data.columns if col != 'transaction_category']]\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", - "metadata": {}, - "outputs": [], - "source": [ - "# Import MLRun's Feature Store\n", - "import mlrun.feature_store as fstore\n", - "\n", - "data_cols = list(data.columns)\n", - "# create feature vector on top of aggreagations\n", - "# Define the list of features we will be using\n", - "features = [f\"transactions.{name}\" for name in data_cols[1:]] \n", - "\n", - 
"\n", - "# Define the feature vector name for future reference\n", - "fv_name = 'transactions-vector'\n", - "\n", - "# Define the feature vector using our Feature Store (fstore)\n", - "transactions_fv = fstore.FeatureVector(fv_name, \n", - " features,\n", - " label_feature='transactions.transaction_category_mapped',\n", - " description='stocks information')\n", - "\n", - "#label_feature = 'transactions-v2.transaction_category',\n", - "# Save the feature vector in the Feature Store\n", - "transactions_fv.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "eb69d9fa-22a9-4b9f-9443-d00d9190ad55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
amount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistancetransaction_category_mapped
0879.78000045185519044999194457298962882528879.783202445912270.0000000
1840.50000047579519156690804655296518888015801.22720242611444939.2800000
2701.43666745185519044999194910949333064003423.3111202431551621278.1266670
3621.76000045185519044999194415760195692405382.7315202411712851239.0300000
4519.76200040980889806929744412940106031926111.771920241385057407.9920000
..........................................
24994211.45363641796068600888494359198069543354302.10999792024380465790.64636418
24995211.10739147515386207333054021524999937895115.89999832024431104295.21739118
24996211.09285244050083552203244165276502284291207.089998720242101834304.01285218
24997211.61259040921157888775434328901131757235355.5899991202441811537143.96741018
24998211.58623742620471944990064017367486513464204.269999520242111623157.32623718
\n", - "

24999 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " amount_avg_1d receiver_id sender_id amount \\\n", - "0 879.780000 4518551904499919 4457298962882528 879.78 \n", - "1 840.500000 4757951915669080 4655296518888015 801.22 \n", - "2 701.436667 4518551904499919 4910949333064003 423.31 \n", - "3 621.760000 4518551904499919 4415760195692405 382.73 \n", - "4 519.762000 4098088980692974 4412940106031926 111.77 \n", - "... ... ... ... ... \n", - "24994 211.453636 4179606860088849 4359198069543354 302.10 \n", - "24995 211.107391 4751538620733305 4021524999937895 115.89 \n", - "24996 211.092852 4405008355220324 4165276502284291 207.08 \n", - "24997 211.612590 4092115788877543 4328901131757235 355.58 \n", - "24998 211.586237 4262047194499006 4017367486513464 204.26 \n", - "\n", - " transaction_id timestamp_year timestamp_month timestamp_day \\\n", - "0 3 2024 4 5 \n", - "1 7 2024 2 6 \n", - "2 11 2024 3 15 \n", - "3 15 2024 1 17 \n", - "4 19 2024 1 3 \n", - "... ... ... ... ... \n", - "24994 99979 2024 3 8 \n", - "24995 99983 2024 4 3 \n", - "24996 99987 2024 2 10 \n", - "24997 99991 2024 4 18 \n", - "24998 99995 2024 2 11 \n", - "\n", - " timestamp_hour timestamp_minute timestamp_second distance \\\n", - "0 9 12 27 0.000000 \n", - "1 11 44 49 39.280000 \n", - "2 5 16 21 278.126667 \n", - "3 12 8 51 239.030000 \n", - "4 8 50 57 407.992000 \n", - "... ... ... ... ... \n", - "24994 0 46 57 90.646364 \n", - "24995 11 0 42 95.217391 \n", - "24996 18 34 30 4.012852 \n", - "24997 11 53 7 143.967410 \n", - "24998 16 23 15 7.326237 \n", - "\n", - " transaction_category_mapped \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "... ... 
\n", - "24994 18 \n", - "24995 18 \n", - "24996 18 \n", - "24997 18 \n", - "24998 18 \n", - "\n", - "[24999 rows x 13 columns]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fs\n", - "resp = transactions_fv.get_offline_features()\n", - "#Preview the dataset\n", - "fv_data = resp.to_dataframe()\n", - "fv_data" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "cb156ebe-9846-4ff3-a388-92362df7c741", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'amount_avg_1d': 211.58623655913973,\n", - " 'receiver_id': 4262047194499006,\n", - " 'sender_id': 4017367486513464,\n", - " 'amount': 204.26,\n", - " 'transaction_id': 99995,\n", - " 'timestamp_year': 2024,\n", - " 'timestamp_month': 2,\n", - " 'timestamp_day': 11,\n", - " 'timestamp_hour': 16,\n", - " 'timestamp_minute': 23,\n", - " 'timestamp_second': 15,\n", - " 'distance': 7.326236559139744}]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "svc = transactions_fv.get_online_feature_service()\n", - "resp = svc.get([{\"transaction_category\": \"Pension and insurances\"}])\n", - "resp" - ] - }, - { - "cell_type": "markdown", - "id": "b5e4834e", - "metadata": {}, - "source": [ - "We update the values in the feature store with the real values of our data" - ] - }, - { - "cell_type": "markdown", - "id": "e2f6395f", - "metadata": {}, - "source": [ - "And display them after getting them from the feature store" - ] - }, - { - "cell_type": "markdown", - "id": "cf148985", - "metadata": {}, - "source": [ - "We use the feature store to calculate the distance between the average of every category and the current amount" - ] - }, - { - "cell_type": "markdown", - "id": "289eeca6", - "metadata": {}, - "source": [ - "### 4. 
Create model \n", - "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", - "\n", - "\n", - "\n", - "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "1cbb00b5-46bf-4a20-aad9-a03716ab97ae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
00879.78000045185519044999194457298962882528879.783202445912270.000000
10840.50000047579519156690804655296518888015801.22720242611444939.280000
20701.43666745185519044999194910949333064003423.3111202431551621278.126667
30621.76000045185519044999194415760195692405382.7315202411712851239.030000
40519.76200040980889806929744412940106031926111.771920241385057407.992000
..........................................
2499418211.45363641796068600888494359198069543354302.10999792024380465790.646364
2499518211.10739147515386207333054021524999937895115.89999832024431104295.217391
2499618211.09285244050083552203244165276502284291207.089998720242101834304.012852
2499718211.61259040921157888775434328901131757235355.5899991202441811537143.967410
2499818211.58623742620471944990064017367486513464204.269999520242111623157.326237
\n", - "

24999 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category_mapped amount_avg_1d receiver_id \\\n", - "0 0 879.780000 4518551904499919 \n", - "1 0 840.500000 4757951915669080 \n", - "2 0 701.436667 4518551904499919 \n", - "3 0 621.760000 4518551904499919 \n", - "4 0 519.762000 4098088980692974 \n", - "... ... ... ... \n", - "24994 18 211.453636 4179606860088849 \n", - "24995 18 211.107391 4751538620733305 \n", - "24996 18 211.092852 4405008355220324 \n", - "24997 18 211.612590 4092115788877543 \n", - "24998 18 211.586237 4262047194499006 \n", - "\n", - " sender_id amount transaction_id timestamp_year \\\n", - "0 4457298962882528 879.78 3 2024 \n", - "1 4655296518888015 801.22 7 2024 \n", - "2 4910949333064003 423.31 11 2024 \n", - "3 4415760195692405 382.73 15 2024 \n", - "4 4412940106031926 111.77 19 2024 \n", - "... ... ... ... ... \n", - "24994 4359198069543354 302.10 99979 2024 \n", - "24995 4021524999937895 115.89 99983 2024 \n", - "24996 4165276502284291 207.08 99987 2024 \n", - "24997 4328901131757235 355.58 99991 2024 \n", - "24998 4017367486513464 204.26 99995 2024 \n", - "\n", - " timestamp_month timestamp_day timestamp_hour timestamp_minute \\\n", - "0 4 5 9 12 \n", - "1 2 6 11 44 \n", - "2 3 15 5 16 \n", - "3 1 17 12 8 \n", - "4 1 3 8 50 \n", - "... ... ... ... ... \n", - "24994 3 8 0 46 \n", - "24995 4 3 11 0 \n", - "24996 2 10 18 34 \n", - "24997 4 18 11 53 \n", - "24998 2 11 16 23 \n", - "\n", - " timestamp_second distance \n", - "0 27 0.000000 \n", - "1 49 39.280000 \n", - "2 21 278.126667 \n", - "3 51 239.030000 \n", - "4 57 407.992000 \n", - "... ... ... 
\n", - "24994 57 90.646364 \n", - "24995 42 95.217391 \n", - "24996 30 4.012852 \n", - "24997 7 143.967410 \n", - "24998 15 7.326237 \n", - "\n", - "[24999 rows x 13 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fs\n", - "resp = transactions_fv.get_offline_features()\n", - "#Preview the dataset\n", - "fv_data = resp.to_dataframe()\n", - "\n", - "column_to_move = 'transaction_category_mapped'\n", - "\n", - "new_columns_order = [column_to_move] + [col for col in fv_data.columns if col != column_to_move]\n", - "fv_data = fv_data[new_columns_order]\n", - "\n", - "\n", - "data = fv_data\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "47512de3-60ac-49c7-ace8-031959527e86", - "metadata": {}, - "outputs": [], - "source": [ - "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", - "train_data, validation_data, test_data = np.split(\n", - " fv_data.sample(frac=1, random_state=42), [int(0.7 * len(fv_data)), int(0.9 * len(fv_data))]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f81f65b9", - "metadata": {}, - "source": [ - "We save these sets to a file." 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "f849a7a9", - "metadata": {}, - "outputs": [], - "source": [ - "train_data.to_csv(\"train.csv\", index=False, header=False)\n", - "validation_data.to_csv(\"validation.csv\", index=False, header=False)\n", - "test_data.to_csv(\"test.csv\", index=False, header=True)" - ] - }, - { - "cell_type": "markdown", - "id": "de669936", - "metadata": {}, - "source": [ - "And upload these files to our s3 bucket" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "e1ca2543", - "metadata": {}, - "outputs": [], - "source": [ - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"train/train.csv\")\n", - ").upload_file(\"train.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"validation/validation.csv\")\n", - ").upload_file(\"validation.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"test/test.csv\")\n", - ").upload_file(\"test.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "22de532f", - "metadata": {}, - "source": [ - "Get the XGBoost sagemaker image" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "a41b6a7d", - "metadata": {}, - "outputs": [], - "source": [ - "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" - ] - }, - { - "cell_type": "markdown", - "id": "66cae2a9", - "metadata": {}, - "source": [ - "Transform our data to a sagemaker input for training" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "e51c917a", - "metadata": {}, - "outputs": [], - "source": [ - "s3_input_train = sagemaker.inputs.TrainingInput(\n", - " s3_data=\"s3://{}/{}/train\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")\n", - "s3_input_validation = sagemaker.inputs.TrainingInput(\n", - " 
s3_data=\"s3://{}/{}/validation/\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6f2985d8", - "metadata": {}, - "source": [ - "We define the XGBoost model" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "92c1fe8c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb = sagemaker.estimator.Estimator(\n", - " container,\n", - " role,\n", - " instance_count=1,\n", - " instance_type=\"ml.m4.xlarge\",\n", - " output_path=\"s3://{}/{}/output\".format(s3_bucket, bucket_prefix),\n", - " sagemaker_session=sagemaker_session,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ecafdfe8", - "metadata": {}, - "source": [ - "Set the parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "582adc6c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb.set_hyperparameters(\n", - " max_depth=5,\n", - " eta=0.2,\n", - " gamma=4,\n", - " min_child_weight=6,\n", - " subsample=0.8,\n", - " objective=\"multi:softprob\",\n", - " num_class=19,\n", - " verbosity=0,\n", - " num_round=100,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b36463dd", - "metadata": {}, - "source": [ - "And train the model" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "c24e06fc", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-16-32-32-584\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-02-11 16:32:32 Starting - Starting the training job...\n", - "2024-02-11 16:32:46 Starting - Preparing the instances for training......\n", - "2024-02-11 16:33:57 Downloading - Downloading input data......\n", - "2024-02-11 16:34:37 Downloading - Downloading the training image...\n", - "2024-02-11 16:35:23 Training - Training image download completed. 
Training in progress...\u001b[34m[2024-02-11 16:35:36.980 ip-10-0-69-198.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Train matrix has 17499 rows and 12 columns\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Validation matrix has 5000 rows\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.099 ip-10-0-69-198.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.100 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.101 ip-10-0-69-198.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.00166#011validation-merror:0.00380\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.268 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.271 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.00126#011validation-merror:0.00320\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", - "\u001b[34m[6]#011train-merror:0.00103#011validation-merror:0.00260\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", - 
"\u001b[34m[12]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.00046#011validation-merror:0.00180\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.00029#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.00040#011validation-merror:0.00160\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.00017#011validation-merror:0.00100\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[29]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - 
"\u001b[34m[35]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[52]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - 
"\u001b[34m[58]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - 
"\u001b[34m[81]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\n", - "2024-02-11 16:35:53 Uploading - Uploading generated training model\n", - "2024-02-11 16:36:04 Completed - Training job completed\n", - "Training seconds: 127\n", - "Billable seconds: 127\n" - ] - } - ], - "source": [ - "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" - ] - }, - { - "cell_type": "markdown", - "id": "8b716cd7", - "metadata": {}, - "source": [ - "### 
5. Using the endpoint \n", - "\n", - "Deploy the model to an endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz'" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "xgb.model_data" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "78444d49-4ad3-49e4-a579-19b173facb26", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function = project.get_function(\"serving\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "911457fa-812d-4991-a31c-4dfcb1593d3e", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function_v2 = project.set_function(\n", - " func=\"src/functions/serving.py\",\n", - " name=\"serving-v2\",\n", - " kind=\"serving\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2881c17d-dd84-43d7-acc7-83e40c8110d3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "_start->\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "xgboost-model\n", - "\n", - "xgboost-model\n", - "\n", - "\n", - "\n", - "->xgboost-model\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "graph = serving_function_v2.set_topology(\n", - " \"router\",\n", - " 
mlrun.serving.routers.EnrichmentModelRouter(\n", - " feature_vector_uri=transactions_fv.uri,\n", - " impute_policy={\"*\": \"$mean\"}),\n", - ")\n", - "serving_function_v2.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=xgb.model_data)\n", - "\n", - "# Plot the ensemble configuration\n", - "serving_function_v2.spec.graph.plot()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "0ab0bcd2-5c70-4f48-bff9-d060f027e8e5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:36:45,242 [info] model xgboost-model was loaded\n", - "> 2024-02-11 16:36:45,243 [info] Loaded ['xgboost-model']\n" - ] - } - ], - "source": [ - "server = serving_function_v2.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "dd57cfcd-5878-4775-83ee-422dc2261ce8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'inputs': [[211.58623655913973, 4262047194499006, 4017367486513464, 204.26, 99995, 2024, 2, 11, 16, 23, 15, 7.326236559139744]]}\n" - ] - }, - { - "data": { - "text/plain": [ - "{'id': '9fca777838b34ecaa9a0978beb4c3324',\n", - " 'model_name': 'xgboost-model',\n", - " 'outputs': [[0.0006098453304730356,\n", - " 0.000491024402435869,\n", - " 0.0005141795263625681,\n", - " 0.0007783450419083238,\n", - " 0.0007057395414449275,\n", - " 0.0006167895044200122,\n", - " 0.0008293685968965292,\n", - " 0.0007642377750016749,\n", - " 0.0004749966901727021,\n", - " 0.0009146890370175242,\n", - " 0.0023798206821084023,\n", - " 0.0007584734121337533,\n", - " 0.0005588593194261193,\n", - " 0.0018726944690570235,\n", - " 0.001147600938566029,\n", - " 0.0010383735643699765,\n", - " 0.0010812224354594946,\n", - " 0.006694969721138477,\n", - " 0.9777687788009644]]}" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "response = server.test(body={'inputs':[['Pension 
and insurances']]})\n", - "response" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:39:18,386 [info] Starting remote function deploy\n", - "2024-02-11 16:39:18 (info) Deploying function\n", - "2024-02-11 16:39:18 (info) Building\n", - "2024-02-11 16:39:19 (info) Staging files and preparing base images\n", - "2024-02-11 16:39:19 (info) Building processor image\n", - "2024-02-11 16:40:24 (info) Build complete\n", - "2024-02-11 16:40:33 (info) Function deploy complete\n", - "> 2024-02-11 16:40:40,471 [info] Successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-v3-admin-serving-v2.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" - ] - }, - { - "data": { - "text/plain": [ - "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-v3-admin-serving-v2'})" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "project.deploy_function(\"serving-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "ac19dc03-01e2-4e29-ba75-a34804833d5c", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function_v2 = project.get_function(\"serving-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:40:40,546 [info] Invoking function: {'method': 'POST', 'path': 
'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//v2/models/xgboost-model/predict'}\n" - ] - } - ], - "source": [ - "response = serving_function_v2.invoke(path='/v2/models/xgboost-model/predict', body={\"inputs\": [['Pension and insurances']]})" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "57eeaddc-654a-41d2-bb51-4a9a787a3311", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '641c5971-c881-4d56-a326-8b02900be8db',\n", - " 'model_name': 'xgboost-model',\n", - " 'outputs': [[0.0006098453304730356,\n", - " 0.000491024402435869,\n", - " 0.0005141795263625681,\n", - " 0.0007783450419083238,\n", - " 0.0007057395414449275,\n", - " 0.0006167895044200122,\n", - " 0.0008293685968965292,\n", - " 0.0007642377750016749,\n", - " 0.0004749966901727021,\n", - " 0.0009146890370175242,\n", - " 0.0023798206821084023,\n", - " 0.0007584734121337533,\n", - " 0.0005588593194261193,\n", - " 0.0018726944690570235,\n", - " 0.001147600938566029,\n", - " 0.0010383735643699765,\n", - " 0.0010812224354594946,\n", - " 0.006694969721138477,\n", - " 0.9777687788009644]]}" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "response" - ] - }, - { - "cell_type": "markdown", - "id": "712f4d35", - "metadata": {}, - "source": [ - "### 6. 
Evaluate performance \n", - "\n", - "Run the model on our test data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ff008b-f4e4-491b-b1e8-3b0a652c35fc", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "2e863ea7-5804-4637-b677-390c305cabfe", - "metadata": {}, - "outputs": [], - "source": [ - "s3_data = \"s3://{}/{}/test/test.csv\".format(s3_bucket, bucket_prefix)" - ] - }, - { - "cell_type": "markdown", - "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", - "metadata": {}, - "source": [ - "Add the evaluation function to our project" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "ca4f7e49", - "metadata": {}, - "outputs": [], - "source": [ - "evaluate_function = project.get_function(\"evaluate\")" - ] - }, - { - "cell_type": "markdown", - "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", - "metadata": {}, - "source": [ - "Run the evaluation job" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:40:40,769 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '77fb208c0816491e9f7ae69634b31b1b', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", - "> 2024-02-11 16:40:41,204 [info] Job is running in the background, pod: evaluate-evaluate-x5mzk\n", - "[16:40:45] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", - "configuration generated by an older version of XGBoost, please export the model by calling\n", - "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", - "\n", - " https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html\n", - "\n", - "for more details about differences between saving model and serializing.\n", - "\n", - "> 2024-02-11 16:40:46,110 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin', 'logs_cmd': 'mlrun logs 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin'}\n", - "> 2024-02-11 16:40:46,110 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-v3-admin/jobs/monitor/77fb208c0816491e9f7ae69634b31b1b/overview'}\n", - "> 2024-02-11 16:40:46,110 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sagemaker-v3-admin0Feb 11 16:40:44completedevaluate-evaluate
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc26
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-x5mzk
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category_mapped
factorize_key={'Uncategorized': 0, 'Entertainment': 1, 'Education': 2, 'Shopping': 3, 'Personal Care': 4, 'Health and Fitness': 5, 'Food and Dining': 6, 'Gifts and Donations': 7, 'Investments': 8, 'Bills and Utilities': 9, 'Auto and Transport': 10, 'Travel': 11, 'Fees and Charges': 12, 'Business Services': 13, 'Personal Services': 14, 'Taxes': 15, 'Gambling': 16, 'Home': 17, 'Pension and insurances': 18}
classification_report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:40:54,323 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" - ] - } - ], - "source": [ - "evaluate_run = evaluate_function.run(\n", - " handler=\"evaluate\",\n", - " params={\n", - " \"model_path\": xgb.model_data,\n", - " \"model_name\": \"xgboost-model\",\n", - " \"test_set\": s3_data,\n", - " \"label_column\": \"transaction_category_mapped\",\n", - " \"factorize_key\": factorize_key,\n", - " },\n", - " returns=[\"classification_report: dataset\"])" - ] - }, - { - "cell_type": "markdown", - "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", - "metadata": {}, - "source": [ - "See the evaluation result" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
precisionrecallf1-scoresupport
Uncategorized1.0000001.0000001.0000008.0000
Entertainment1.0000001.0000001.000000362.0000
Education1.0000000.9285710.96296314.0000
Shopping0.9988391.0000000.999419860.0000
Personal Care1.0000001.0000001.00000023.0000
Health and Fitness1.0000001.0000001.000000123.0000
Food and Dining1.0000001.0000001.000000217.0000
Gifts and Donations1.0000001.0000001.00000068.0000
Investments0.9166671.0000000.95652222.0000
Bills and Utilities1.0000001.0000001.00000092.0000
Auto and Transport1.0000001.0000001.000000475.0000
Travel1.0000001.0000001.00000036.0000
Fees and Charges1.0000001.0000001.00000029.0000
Business Services1.0000001.0000001.00000047.0000
Personal Services1.0000001.0000001.00000024.0000
Taxes1.0000000.8461540.91666713.0000
Gambling1.0000001.0000001.0000008.0000
Home1.0000001.0000001.00000052.0000
Pension and insurances1.0000001.0000001.00000027.0000
accuracy0.9988000.9988000.9988000.9988
macro avg0.9955530.9881430.9913462500.0000
weighted avg0.9988670.9988000.9987772500.0000
\n", - "
" - ], - "text/plain": [ - " precision recall f1-score support\n", - "Uncategorized 1.000000 1.000000 1.000000 8.0000\n", - "Entertainment 1.000000 1.000000 1.000000 362.0000\n", - "Education 1.000000 0.928571 0.962963 14.0000\n", - "Shopping 0.998839 1.000000 0.999419 860.0000\n", - "Personal Care 1.000000 1.000000 1.000000 23.0000\n", - "Health and Fitness 1.000000 1.000000 1.000000 123.0000\n", - "Food and Dining 1.000000 1.000000 1.000000 217.0000\n", - "Gifts and Donations 1.000000 1.000000 1.000000 68.0000\n", - "Investments 0.916667 1.000000 0.956522 22.0000\n", - "Bills and Utilities 1.000000 1.000000 1.000000 92.0000\n", - "Auto and Transport 1.000000 1.000000 1.000000 475.0000\n", - "Travel 1.000000 1.000000 1.000000 36.0000\n", - "Fees and Charges 1.000000 1.000000 1.000000 29.0000\n", - "Business Services 1.000000 1.000000 1.000000 47.0000\n", - "Personal Services 1.000000 1.000000 1.000000 24.0000\n", - "Taxes 1.000000 0.846154 0.916667 13.0000\n", - "Gambling 1.000000 1.000000 1.000000 8.0000\n", - "Home 1.000000 1.000000 1.000000 52.0000\n", - "Pension and insurances 1.000000 1.000000 1.000000 27.0000\n", - "accuracy 0.998800 0.998800 0.998800 0.9988\n", - "macro avg 0.995553 0.988143 0.991346 2500.0000\n", - "weighted avg 0.998867 0.998800 0.998777 2500.0000" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluate_run.artifact(\"classification_report\").as_df()" - ] - }, - { - "cell_type": "markdown", - "id": "98d0b67e", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "You should see results similar to this:\n", - "\n", - "```\n", - " precision recall f1-score support\n", - "\n", - " Uncategorized 1.00 0.92 0.96 51\n", - " Entertainment 0.81 0.89 0.85 1486\n", - " Education 1.00 0.94 0.97 80\n", - " Shopping 0.86 0.94 0.90 3441\n", - " Personal Care 1.00 0.98 0.99 132\n", - " Health and Fitness 0.99 0.89 0.94 443\n", - " 
Food and Dining 0.99 0.82 0.90 918\n", - " Gifts and Donations 1.00 0.95 0.97 275\n", - " Investments 0.99 0.97 0.98 88\n", - " Bills and Utilities 1.00 0.99 1.00 332\n", - " Auto and Transport 0.94 0.84 0.88 1967\n", - " Travel 0.96 0.84 0.90 120\n", - " Fees and Charges 1.00 0.94 0.97 106\n", - " Business Services 1.00 0.99 1.00 146\n", - " Personal Services 1.00 0.96 0.98 75\n", - " Taxes 0.98 0.94 0.96 47\n", - " Gambling 1.00 1.00 1.00 15\n", - " Home 0.98 0.89 0.93 168\n", - "Pension and insurances 0.99 1.00 1.00 110\n", - "\n", - " accuracy 0.90 10000\n", - " macro avg 0.97 0.93 0.95 10000\n", - " weighted avg 0.91 0.90 0.90 10000\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "49fdc82d", - "metadata": {}, - "source": [ - "### 7. Clean up \n", - "\n", - "Remove the feature group and endpoint to clean up" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f79b1164", - "metadata": {}, - "outputs": [], - "source": [ - "#feature_group.delete()\n", - "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" - ] - }, - { - "cell_type": "markdown", - "id": "e04b6fa6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Notebook CI Test Results\n", - "\n", - "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", - "\n", - "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-east-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-3 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n" - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "smdemo", - "language": "python", - "name": "smdemo" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/financial_payment_classification_with_serving.ipynb b/financial_payment_classification_with_serving.ipynb new file mode 100644 index 0000000..31eb690 --- /dev/null +++ b/financial_payment_classification_with_serving.ipynb @@ -0,0 +1,2137 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01b5c703", + "metadata": {}, + "source": [ + "# SageMaker Payment Classification \n" + ] + }, + { + "cell_type": "markdown", + "id": "6498f087", + "metadata": {}, + "source": [ + "---\n", + "\n", + "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", + "\n", + "![This us-west-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "c2e49281", + "metadata": {}, + "source": [ + "\n", + "## Background \n", + "\n", + "This notebook demonstrates how you can train and deploy a machine learning model to classify payment transactions, enriching each financial transaction with its category. This can be used as an intermediate step in fraud detection, personalization or anomaly detection, as well as a method to provide end users (e.g. customers at a bank) with an overview of their spending habits. Amazon SageMaker can be used to train and deploy an XGBoost model, as well as the required underlying infrastructure. For this notebook a generated dataset is used where a payment consists of mostly an amount, sender, receiver and timestamp.\n", + "\n", + "\n", + "## Notebook overview \n", + "\n", + "This notebook consists of seven parts. First, we import and configure the required libraries. After that we prepare the data used in this example and create the feature store. With the newly created features we create an XGBoost model. An endpoint is created to host this model. We evaluate the performance of the model and end by cleaning up the used resources.\n", + "\n", + "## Dataset \n", + "\n", + "For this notebook we use a synthetic dataset. 
This dataset has the following features \n", + "\n", + "* __transaction_category__: The category of the transaction, this is one of the following 19 options.\n", + "\n", + " 'Uncategorized', 'Entertainment', 'Education',\n", + " 'Shopping', 'Personal Care', 'Health and Fitness',\n", + " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", + " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", + " 'Fees and Charges', 'Business Services', 'Personal Services',\n", + " 'Taxes', 'Gambling', 'Home',\n", + " 'Pension and insurances'\n", + "\n", + "\n", + "* __receiver_id__: an identifier for the receiving party. The identifier consists of 16 digits.\n", + "* __sender_id__: an identifier for the sending party. The identifier consists of 16 digits.\n", + "* __amount__: the amount which is transferred.\n", + "* __timestamp__: the timestamp of the transaction in YYYY-MM-DD HH:MM:SS format.\n", + "\n", + "\n", + "### 1. Setup \n", + "\n", + "Before we start we need to update the sagemaker library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fff19d6b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import sys\n", + "!{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest version\n", + "!{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest version\n", + "!{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest version" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 
2024-01-25 14:10:06,832 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" + ] + } + ], + "source": [ + "project = mlrun.get_or_create_project(\n", + " name=\"sagemaker\", \n", + " user_project=True,\n", + " parameters={\n", + " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", + " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1b17a94d", + "metadata": {}, + "source": [ + "Now that we have the latest version we can import the libraries that we'll use in this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "42c5d6d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /User/.config/sagemaker/config.yaml\n" + ] + } + ], + "source": [ + "import boto3\n", + "import io\n", + "import sagemaker\n", + "import time\n", + "import os\n", + "from time import sleep\n", + "from sklearn.metrics import classification_report\n", + "from sagemaker.feature_store.feature_group import FeatureGroup\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", + "metadata": {}, + "outputs": [], + "source": [ + "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", + "metadata": {}, + "outputs": [], + "source": [ + "sess = sagemaker.Session()\n", + "write_bucket = sess.default_bucket()\n", + "write_prefix = \"sagemaker-app-lab\"" + ] + }, + { + "cell_type": "markdown", + "id": "3af7c33d", + "metadata": {}, + "source": [ + "Let's set the session variables to ensure that SageMaker is configured correctly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c0e4db17", + "metadata": {}, + "outputs": [], + "source": [ + "region = sagemaker.Session().boto_region_name\n", + "sm_client = boto3.client(\"sagemaker\")\n", + "boto_session = boto3.Session(region_name=region)\n", + "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", + "#role = sagemaker.get_execution_role()\n", + "role = sagemaker_role\n", + "bucket_prefix = \"payment-classification\"\n", + "s3_bucket = sagemaker_session.default_bucket()" + ] + }, + { + "cell_type": "markdown", + "id": "4fe6a975", + "metadata": {}, + "source": [ + "We define the factorize key which is used to map the '__transaction_category__' to numeric values" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "43946b9f", + "metadata": {}, + "outputs": [], + "source": [ + "factorize_key = {\n", + " \"Uncategorized\": 0,\n", + " \"Entertainment\": 1,\n", + " \"Education\": 2,\n", + " \"Shopping\": 3,\n", + " \"Personal Care\": 4,\n", + " \"Health and Fitness\": 5,\n", + " \"Food and Dining\": 6,\n", + " \"Gifts and Donations\": 7,\n", + " \"Investments\": 8,\n", + " \"Bills and Utilities\": 9,\n", + " \"Auto and Transport\": 10,\n", + " \"Travel\": 11,\n", + " \"Fees and Charges\": 12,\n", + " \"Business Services\": 13,\n", + " \"Personal Services\": 14,\n", + " \"Taxes\": 15,\n", + " \"Gambling\": 16,\n", + " \"Home\": 17,\n", + " \"Pension and insurances\": 18,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5e3dc3c4", + "metadata": {}, + "source": [ + "### 2. 
Data preparation \n", + "\n", + "We ingest the simulated data from the public SageMaker S3 training database:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5ff0d280", + "metadata": {}, + "outputs": [], + "source": [ + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " f\"sagemaker-example-files-prod-{region}\",\n", + " \"datasets/tabular/synthetic_financial/financial_transactions_mini.csv\",\n", + " \"financial_transactions_mini.csv\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "08578d93", + "metadata": {}, + "source": [ + "Let's start by loading the dataset from our csv file into a Pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a477abd7", + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\n", + " \"financial_transactions_mini.csv\",\n", + " parse_dates=[\"timestamp\"],\n", + " infer_datetime_format=True,\n", + " dtype={\"transaction_category\": \"string\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cf6be447", + "metadata": {}, + "source": [ + "The dataframe looks as follows:\n", + "\n", + "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", + "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", + "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", + "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", + "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", + "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", + "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", + "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", + "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 
13:23:44 |\n", + "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", + "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", + "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" + ] + }, + { + "cell_type": "markdown", + "id": "b5492919", + "metadata": {}, + "source": [ + "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "24f6090e", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"year\"] = data[\"timestamp\"].dt.year\n", + "data[\"month\"] = data[\"timestamp\"].dt.month\n", + "data[\"day\"] = data[\"timestamp\"].dt.day\n", + "data[\"hour\"] = data[\"timestamp\"].dt.hour\n", + "data[\"minute\"] = data[\"timestamp\"].dt.minute\n", + "data[\"second\"] = data[\"timestamp\"].dt.second\n", + "\n", + "del data[\"timestamp\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", + "metadata": {}, + "outputs": [], + "source": [ + "for key, val in factorize_key.items():\n", + " factorize_key[key] = str(val)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0ee06b1d-0cfb-4242-a7e7-2443a0377d99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "[ 'Uncategorized', 'Entertainment', 'Education',\n", + " 'Shopping', 'Personal Care', 'Health and Fitness',\n", + " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", + " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", + " 'Fees and Charges', 'Business Services', 'Personal Services',\n", + " 'Taxes', 'Gambling', 'Home',\n", + " 'Pension and insurances']\n", + "Length: 19, dtype: string" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"transaction_category\"].unique()" + ] + }, + { + 
"cell_type": "markdown", + "id": "f7314f8a", + "metadata": {}, + "source": [ + "We'll transform the transaction categories to numeric targets for the classification by factorization." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ea2ebdd5", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", + "metadata": { + "tags": [] + }, + "source": [ + "### 3. Create feature store \n", + "\n", + "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). " + ] + }, + { + "cell_type": "markdown", + "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", + "metadata": {}, + "source": [ + "#### feature-group-payment-classification" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3c621044-681a-4e1a-9968-f637ed992539", + "metadata": {}, + "outputs": [], + "source": [ + "def add_grouped_features(df):\n", + " feature_store_data = pd.DataFrame()\n", + " feature_store_data[\"mean_amount\"] = df.groupby([\"transaction_category\"]).mean()[\"amount\"]\n", + " feature_store_data[\"count\"] = df.groupby([\"transaction_category\"]).count()[\"amount\"]\n", + " feature_store_data[\"identifier\"] = feature_store_data.index\n", + " feature_store_data[\"EventTime\"] = time.time()\n", + " \n", + " \n", + " \n", + " additional_features = pd.pivot_table(\n", + " feature_store_data, values=[\"mean_amount\"], index=[\"identifier\"]\n", + " ).T.add_prefix(\"dist_\")\n", + " additional_features_columns = list(additional_features.columns)\n", + " df2 = df.copy()\n", + " df2 = pd.concat([df2, pd.DataFrame(columns=additional_features_columns, dtype=object)])\n", + " df2[additional_features_columns] = additional_features.values[0]\n", + " for col in additional_features_columns:\n", + " df2[col] = abs(df2[col] - df2[\"amount\"]) \n", + " df2['transaction_id']= 
df2.reset_index().index \n", + " return df2" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c71af4a9-f2d8-40ca-b0bf-3ef67c5b69d9", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "add_grouped_features\n", + "\n", + "add_grouped_features\n", + "\n", + "\n", + "\n", + "_start->add_grouped_features\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "parquet/parquet\n", + "\n", + "\n", + "parquet\n", + "\n", + "\n", + "\n", + "add_grouped_features->parquet/parquet\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "nosql/nosql\n", + "\n", + "\n", + "nosql\n", + "\n", + "\n", + "\n", + "add_grouped_features->nosql/nosql\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fstore\n", + "from mlrun.datastore.targets import ParquetTarget\n", + "\n", + "# creating feature set\n", + "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", + " entities=[fstore.Entity(\"transaction_id\")],\n", + " engine=\"pandas\",\n", + " description=\"transactions feature set\")\n", + "\n", + "# setting up the graph\n", + "extended_transactions_set.graph \\\n", + " .to(name='add_grouped_features', handler='add_grouped_features')\n", + " # Add aggregations for 2, 12, and 24 hour time windows\n", + " \n", + " \n", + "\n", + "\n", + "extended_transactions_set.set_targets()\n", + "\n", + "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2085e0a9-56e1-4641-a4a6-64e2124d9c15", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-01-25 14:11:30,483 [warning] Overriding type of entity 'transaction_id' from 
'str' to 'int'. This may result in errors or unusable data.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
transaction_categoryreceiver_idsender_idamountyearmonthdayhourminutesecond...dist_18dist_2dist_3dist_4dist_5dist_6dist_7dist_8dist_9transaction_id
004.518552e+154.333582e+15833.262021.03.010.019.057.042.0...627.80284917.893495732.342497801.755964713.663595740.010607782.1875535191.287484718.4804420
104.518552e+154.642413e+15596.632021.02.011.017.053.032.0...391.172849254.523495495.712497565.125964477.033595503.380607545.5575535427.917484481.8504421
204.274544e+154.952666e+15176.762021.02.021.018.029.032.0...28.697151674.39349575.842497145.25596457.16359583.510607125.6875535847.78748461.9804422
304.518552e+154.457299e+15879.782021.04.09.016.014.019.0...674.32284928.626505778.862497848.275964760.183595786.530607828.7075535144.767484765.0004423
404.601853e+154.578126e+15742.252021.04.04.015.050.016.0...536.792849108.903495641.332497710.745964622.653595649.000607691.1775535282.297484627.4704424
..................................................................
99992184.405008e+154.583356e+15205.432021.04.020.012.023.053.0...0.027151645.723495104.512497173.92596485.833595112.180607154.3575535819.11748490.65044299992
99993184.300417e+154.949241e+15151.492021.03.024.019.030.018.0...53.967151699.66349550.572497119.98596431.89359558.240607100.4175535873.05748436.71044299993
99994184.405008e+154.996896e+15188.282021.03.08.019.051.010.0...17.177151662.87349587.362497156.77596468.68359595.030607137.2075535836.26748473.50044299994
99995184.262047e+154.017367e+15204.262021.02.014.023.025.07.0...1.197151646.893495103.342497172.75596484.663595111.010607153.1875535820.28748489.48044299995
99996184.627517e+154.250421e+15207.922021.04.014.00.042.00.0...2.462849643.233495107.002497176.41596488.323595114.670607156.8475535816.62748493.14044299996
\n", + "

99997 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " transaction_category receiver_id sender_id amount year month \\\n", + "0 0 4.518552e+15 4.333582e+15 833.26 2021.0 3.0 \n", + "1 0 4.518552e+15 4.642413e+15 596.63 2021.0 2.0 \n", + "2 0 4.274544e+15 4.952666e+15 176.76 2021.0 2.0 \n", + "3 0 4.518552e+15 4.457299e+15 879.78 2021.0 4.0 \n", + "4 0 4.601853e+15 4.578126e+15 742.25 2021.0 4.0 \n", + "... ... ... ... ... ... ... \n", + "99992 18 4.405008e+15 4.583356e+15 205.43 2021.0 4.0 \n", + "99993 18 4.300417e+15 4.949241e+15 151.49 2021.0 3.0 \n", + "99994 18 4.405008e+15 4.996896e+15 188.28 2021.0 3.0 \n", + "99995 18 4.262047e+15 4.017367e+15 204.26 2021.0 2.0 \n", + "99996 18 4.627517e+15 4.250421e+15 207.92 2021.0 4.0 \n", + "\n", + " day hour minute second ... dist_18 dist_2 dist_3 \\\n", + "0 10.0 19.0 57.0 42.0 ... 627.802849 17.893495 732.342497 \n", + "1 11.0 17.0 53.0 32.0 ... 391.172849 254.523495 495.712497 \n", + "2 21.0 18.0 29.0 32.0 ... 28.697151 674.393495 75.842497 \n", + "3 9.0 16.0 14.0 19.0 ... 674.322849 28.626505 778.862497 \n", + "4 4.0 15.0 50.0 16.0 ... 536.792849 108.903495 641.332497 \n", + "... ... ... ... ... ... ... ... ... \n", + "99992 20.0 12.0 23.0 53.0 ... 0.027151 645.723495 104.512497 \n", + "99993 24.0 19.0 30.0 18.0 ... 53.967151 699.663495 50.572497 \n", + "99994 8.0 19.0 51.0 10.0 ... 17.177151 662.873495 87.362497 \n", + "99995 14.0 23.0 25.0 7.0 ... 1.197151 646.893495 103.342497 \n", + "99996 14.0 0.0 42.0 0.0 ... 2.462849 643.233495 107.002497 \n", + "\n", + " dist_4 dist_5 dist_6 dist_7 dist_8 \\\n", + "0 801.755964 713.663595 740.010607 782.187553 5191.287484 \n", + "1 565.125964 477.033595 503.380607 545.557553 5427.917484 \n", + "2 145.255964 57.163595 83.510607 125.687553 5847.787484 \n", + "3 848.275964 760.183595 786.530607 828.707553 5144.767484 \n", + "4 710.745964 622.653595 649.000607 691.177553 5282.297484 \n", + "... ... ... ... ... ... 
\n", + "99992 173.925964 85.833595 112.180607 154.357553 5819.117484 \n", + "99993 119.985964 31.893595 58.240607 100.417553 5873.057484 \n", + "99994 156.775964 68.683595 95.030607 137.207553 5836.267484 \n", + "99995 172.755964 84.663595 111.010607 153.187553 5820.287484 \n", + "99996 176.415964 88.323595 114.670607 156.847553 5816.627484 \n", + "\n", + " dist_9 transaction_id \n", + "0 718.480442 0 \n", + "1 481.850442 1 \n", + "2 61.980442 2 \n", + "3 765.000442 3 \n", + "4 627.470442 4 \n", + "... ... ... \n", + "99992 90.650442 99992 \n", + "99993 36.710442 99993 \n", + "99994 73.500442 99994 \n", + "99995 89.480442 99995 \n", + "99996 93.140442 99996 \n", + "\n", + "[99997 rows x 30 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import mlrun.feature_store as fstore\n", + "data = extended_transactions_set.ingest(data, overwrite=True)\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "e2f6395f", + "metadata": {}, + "source": [ + "And display them after getting them from the feature store" + ] + }, + { + "cell_type": "markdown", + "id": "cf148985", + "metadata": {}, + "source": [ + "We use the feature store to calculate the distance between the average of every category and the current amount" + ] + }, + { + "cell_type": "markdown", + "id": "289eeca6", + "metadata": {}, + "source": [ + "### 4. Create model \n", + "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. 
The model can be substituted at will with a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", + "\n", + "\n", + "\n", + "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bb4bdd8d", + "metadata": {}, + "outputs": [], + "source": [ + "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", + "train_data, validation_data, test_data = np.split(\n", + " data.sample(frac=1, random_state=42), [int(0.7 * len(data)), int(0.9 * len(data))]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f81f65b9", + "metadata": {}, + "source": [ + "We save these sets to a file." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f849a7a9", + "metadata": {}, + "outputs": [], + "source": [ + "train_data.to_csv(\"train.csv\", index=False, header=False)\n", + "validation_data.to_csv(\"validation.csv\", index=False, header=False)\n", + "test_data.to_csv(\"test.csv\", index=False, header=True)" + ] + }, + { + "cell_type": "markdown", + "id": "de669936", + "metadata": {}, + "source": [ + "And upload these files to our s3 bucket" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "e1ca2543", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"train/train.csv\")\n", + ").upload_file(\"train.csv\")\n", + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"validation/validation.csv\")\n", + ").upload_file(\"validation.csv\")\n", + "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", + " os.path.join(bucket_prefix, \"test/test.csv\")\n", + 
").upload_file(\"test.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "22de532f", + "metadata": {}, + "source": [ + "Get the XGBoost sagemaker image" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a41b6a7d", + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" + ] + }, + { + "cell_type": "markdown", + "id": "66cae2a9", + "metadata": {}, + "source": [ + "Transform our data to a sagemaker input for training" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e51c917a", + "metadata": {}, + "outputs": [], + "source": [ + "s3_input_train = sagemaker.inputs.TrainingInput(\n", + " s3_data=\"s3://{}/{}/train\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", + ")\n", + "s3_input_validation = sagemaker.inputs.TrainingInput(\n", + " s3_data=\"s3://{}/{}/validation/\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6f2985d8", + "metadata": {}, + "source": [ + "We define the XGBoost model" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "92c1fe8c", + "metadata": {}, + "outputs": [], + "source": [ + "xgb = sagemaker.estimator.Estimator(\n", + " container,\n", + " role,\n", + " instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + " output_path=\"s3://{}/{}/output\".format(s3_bucket, bucket_prefix),\n", + " sagemaker_session=sagemaker_session,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ecafdfe8", + "metadata": {}, + "source": [ + "Set the parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "582adc6c", + "metadata": {}, + "outputs": [], + "source": [ + "xgb.set_hyperparameters(\n", + " max_depth=5,\n", + " eta=0.2,\n", + " gamma=4,\n", + " min_child_weight=6,\n", + " subsample=0.8,\n", + " objective=\"multi:softprob\",\n", + " num_class=19,\n", + " verbosity=0,\n", + " 
num_round=100,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b36463dd", + "metadata": {}, + "source": [ + "And train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c24e06fc", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-25-14-12-01-149\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-01-25 14:12:01 Starting - Starting the training job...\n", + "2024-01-25 14:12:18 Starting - Preparing the instances for training.........\n", + "2024-01-25 14:13:58 Downloading - Downloading input data......\n", + "2024-01-25 14:14:34 Downloading - Downloading the training image...\n", + "2024-01-25 14:15:29 Training - Training image download completed. Training in progress...\u001b[34m[2024-01-25 14:15:41.041 ip-10-2-106-129.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Single node training.\u001b[0m\n", + 
"\u001b[34m[2024-01-25:14:15:41:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Validation matrix has 20000 rows\u001b[0m\n", + "\u001b[34m[2024-01-25 14:15:41.342 ip-10-2-106-129.ec2.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-01-25:14:15:41:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[0]#011train-merror:0.00047#011validation-merror:0.00050\u001b[0m\n", + "\u001b[34m[2024-01-25 14:15:42.380 ip-10-2-106-129.ec2.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-01-25 14:15:42.383 ip-10-2-106-129.ec2.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[1]#011train-merror:0.00023#011validation-merror:0.00040\u001b[0m\n", + "\u001b[34m[2]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[3]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[4]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[5]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[6]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + 
"\u001b[34m[7]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[8]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[9]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[10]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[11]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[12]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[13]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[14]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[15]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[16]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[17]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[18]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[19]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[20]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[21]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[22]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[23]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[24]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[25]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[26]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[27]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[28]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[29]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + 
"\u001b[34m[30]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[31]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[32]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", + "\u001b[34m[33]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[34]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[35]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[36]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[37]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[38]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[39]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[40]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[41]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[42]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[43]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[44]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[45]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[46]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[47]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[48]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[49]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[50]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[51]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[52]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + 
"\u001b[34m[53]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[54]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[55]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[56]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[57]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[58]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[59]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[60]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[61]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[62]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[63]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[64]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[65]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[66]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[67]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[68]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[69]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[70]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[71]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[72]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[73]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[74]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[75]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + 
"\u001b[34m[76]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[77]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[78]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[79]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[80]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[81]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[82]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[83]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[84]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[85]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[86]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[87]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[88]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[89]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\n", + "2024-01-25 14:17:00 Uploading - Uploading generated training model\u001b[34m[90]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[91]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[92]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[93]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[94]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[95]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[96]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[97]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + 
"\u001b[34m[98]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\u001b[34m[99]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", + "\n", + "2024-01-25 14:17:16 Completed - Training job completed\n", + "Training seconds: 198\n", + "Billable seconds: 198\n" + ] + } + ], + "source": [ + "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" + ] + }, + { + "cell_type": "markdown", + "id": "8b716cd7", + "metadata": {}, + "source": [ + "### 5. Using the endpoint \n", + "\n", + "Deploy the model to an endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "78444d49-4ad3-49e4-a579-19b173facb26", + "metadata": {}, + "outputs": [], + "source": [ + "serving_function = project.get_function(\"serving\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "mlrun-flow\n", + "\n", + "\n", + "\n", + "_start\n", + "\n", + "start\n", + "\n", + "\n", + "\n", + "xgboost-model\n", + "\n", + "xgboost-model\n", + "\n", + "\n", + "\n", + "_start->xgboost-model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "postprocess\n", + "\n", + "postprocess\n", + "\n", + "\n", + "\n", + "xgboost-model->postprocess\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set the topology and get the graph object:\n", + "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", + "\n", + "# Add the steps:\n", + "graph.to(\"XGBModelServer\",\n", + " name=\"xgboost-model\",\n", + " model_path=xgb.model_data) \\\n", + " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", + "\n", + "# Plot to graph:\n", + "serving_function.plot(rankdir='LR')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, 
+ "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-01-25 14:17:46,696 [info] Starting remote function deploy\n", + "2024-01-25 14:17:46 (info) Deploying function\n", + "2024-01-25 14:17:46 (info) Building\n", + "2024-01-25 14:17:47 (info) Staging files and preparing base images\n", + "2024-01-25 14:17:47 (info) Building processor image\n", + "2024-01-25 14:19:32 (info) Build complete\n", + "2024-01-25 14:19:40 (info) Function deploy complete\n", + "> 2024-01-25 14:19:48,105 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/']}\n" + ] + }, + { + "data": { + "text/plain": [ + "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/', 'name': 'sagemaker-yoni-serving'})" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.deploy_function(\"serving\")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c858e3e9-9e43-4148-8015-6047565db456", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'test_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m samples \u001b[38;5;241m=\u001b[39m 
\u001b[43mtest_data\u001b[49m\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtransaction_category\u001b[39m\u001b[38;5;124m'\u001b[39m,axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)[:\u001b[38;5;241m500\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues\u001b[38;5;241m.\u001b[39mtolist()\n", + "\u001b[0;31mNameError\u001b[0m: name 'test_data' is not defined" + ] + } + ], + "source": [ + "samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-01-25 14:19:48,167 [info] invoking function: {'method': 'POST', 'path': 'http://nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080/predict'}\n" + ] + } + ], + "source": [ + "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" + ] + }, + { + "cell_type": "markdown", + "id": "712f4d35", + "metadata": {}, + "source": [ + "### 6. 
Evaluate performance \n", + "\n", + "Run the model on our test data" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "2e863ea7-5804-4637-b677-390c305cabfe", + "metadata": {}, + "outputs": [], + "source": [ + "s3_data = \"s3://{}/{}/test/test.csv\".format(s3_bucket, bucket_prefix)" + ] + }, + { + "cell_type": "markdown", + "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", + "metadata": {}, + "source": [ + "Add the evaluation function to our project" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "ca4f7e49", + "metadata": {}, + "outputs": [], + "source": [ + "evaluate_function = project.get_function(\"evaluate\")" + ] + }, + { + "cell_type": "markdown", + "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", + "metadata": {}, + "source": [ + "Run the evaluation job" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-01-25 14:19:48,410 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': 'cac9cd3c55ba40d58fbe1156d4861e79', 'db': 'http://mlrun-api:8080'}\n", + "> 2024-01-25 14:19:48,708 [info] Job is running in the background, pod: evaluate-evaluate-5rrtk\n", + "[14:19:52] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", + "configuration generated by an older version of XGBoost, please export the model by calling\n", + "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", + "\n", + " https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html\n", + "\n", + "for more details about differences between saving model and serializing.\n", + "\n", + "> 2024-01-25 14:19:53,802 [info] To track results use the CLI: {'info_cmd': 'mlrun get run cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni', 'logs_cmd': 'mlrun logs cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni'}\n", + "> 2024-01-25 14:19:53,802 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/sagemaker-yoni/jobs/monitor/cac9cd3c55ba40d58fbe1156d4861e79/overview'}\n", + "> 2024-01-25 14:19:53,803 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sagemaker-yoni0Jan 25 14:19:51completedevaluate-evaluate
v3io_user=yoni
kind=job
owner=yoni
mlrun/client_version=1.6.0-rc21
mlrun/client_python_version=3.9.16
host=evaluate-evaluate-5rrtk
model_path=s3://sagemaker-us-east-1-934638699319/payment-classification/output/sagemaker-xgboost-2024-01-25-14-12-01-149/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-1-934638699319/payment-classification/test/test.csv
label_column=transaction_category
factorize_key={'Uncategorized': '0', 'Entertainment': '1', 'Education': '2', 'Shopping': '3', 'Personal Care': '4', 'Health and Fitness': '5', 'Food and Dining': '6', 'Gifts and Donations': '7', 'Investments': '8', 'Bills and Utilities': '9', 'Auto and Transport': '10', 'Travel': '11', 'Fees and Charges': '12', 'Business Services': '13', 'Personal Services': '14', 'Taxes': '15', 'Gambling': '16', 'Home': '17', 'Pension and insurances': '18'}
classification_report
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2024-01-25 14:19:59,831 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" + ] + } + ], + "source": [ + "evaluate_run = evaluate_function.run(\n", + " handler=\"evaluate\",\n", + " params={\n", + " \"model_path\": xgb.model_data,\n", + " \"model_name\": \"xgboost-model\",\n", + " \"test_set\": s3_data,\n", + " \"label_column\": \"transaction_category\",\n", + " \"factorize_key\": factorize_key,\n", + " },\n", + " returns=[\"classification_report: dataset\"])" + ] + }, + { + "cell_type": "markdown", + "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", + "metadata": {}, + "source": [ + "See the evaluation result" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precisionrecallf1-scoresupport
Uncategorized1.0000001.0000001.00000051.0000
Entertainment1.0000001.0000001.0000001486.0000
Education1.0000001.0000001.00000080.0000
Shopping1.0000001.0000001.0000003441.0000
Personal Care1.0000001.0000001.000000132.0000
Health and Fitness1.0000001.0000001.000000443.0000
Food and Dining1.0000001.0000001.000000918.0000
Gifts and Donations1.0000001.0000001.000000275.0000
Investments1.0000001.0000001.00000088.0000
Bills and Utilities1.0000001.0000001.000000332.0000
Auto and Transport1.0000001.0000001.0000001967.0000
Travel1.0000001.0000001.000000120.0000
Fees and Charges1.0000001.0000001.000000106.0000
Business Services1.0000001.0000001.000000146.0000
Personal Services1.0000001.0000001.00000075.0000
Taxes1.0000000.9787230.98924747.0000
Gambling0.9375001.0000000.96774215.0000
Home1.0000001.0000001.000000168.0000
Pension and insurances1.0000001.0000001.000000110.0000
accuracy0.9999000.9999000.9999000.9999
macro avg0.9967110.9988800.99773610000.0000
weighted avg0.9999060.9999000.99990110000.0000
\n", + "
" + ], + "text/plain": [ + " precision recall f1-score support\n", + "Uncategorized 1.000000 1.000000 1.000000 51.0000\n", + "Entertainment 1.000000 1.000000 1.000000 1486.0000\n", + "Education 1.000000 1.000000 1.000000 80.0000\n", + "Shopping 1.000000 1.000000 1.000000 3441.0000\n", + "Personal Care 1.000000 1.000000 1.000000 132.0000\n", + "Health and Fitness 1.000000 1.000000 1.000000 443.0000\n", + "Food and Dining 1.000000 1.000000 1.000000 918.0000\n", + "Gifts and Donations 1.000000 1.000000 1.000000 275.0000\n", + "Investments 1.000000 1.000000 1.000000 88.0000\n", + "Bills and Utilities 1.000000 1.000000 1.000000 332.0000\n", + "Auto and Transport 1.000000 1.000000 1.000000 1967.0000\n", + "Travel 1.000000 1.000000 1.000000 120.0000\n", + "Fees and Charges 1.000000 1.000000 1.000000 106.0000\n", + "Business Services 1.000000 1.000000 1.000000 146.0000\n", + "Personal Services 1.000000 1.000000 1.000000 75.0000\n", + "Taxes 1.000000 0.978723 0.989247 47.0000\n", + "Gambling 0.937500 1.000000 0.967742 15.0000\n", + "Home 1.000000 1.000000 1.000000 168.0000\n", + "Pension and insurances 1.000000 1.000000 1.000000 110.0000\n", + "accuracy 0.999900 0.999900 0.999900 0.9999\n", + "macro avg 0.996711 0.998880 0.997736 10000.0000\n", + "weighted avg 0.999906 0.999900 0.999901 10000.0000" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate_run.artifact(\"classification_report\").as_df()" + ] + }, + { + "cell_type": "markdown", + "id": "98d0b67e", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You should see results similar to this:\n", + "\n", + "```\n", + " precision recall f1-score support\n", + "\n", + " Uncategorized 1.00 0.92 0.96 51\n", + " Entertainment 0.81 0.89 0.85 1486\n", + " Education 1.00 0.94 0.97 80\n", + " Shopping 0.86 0.94 0.90 3441\n", + " Personal Care 1.00 0.98 0.99 132\n", + " Health and Fitness 0.99 0.89 0.94 
443\n", + " Food and Dining 0.99 0.82 0.90 918\n", + " Gifts and Donations 1.00 0.95 0.97 275\n", + " Investments 0.99 0.97 0.98 88\n", + " Bills and Utilities 1.00 0.99 1.00 332\n", + " Auto and Transport 0.94 0.84 0.88 1967\n", + " Travel 0.96 0.84 0.90 120\n", + " Fees and Charges 1.00 0.94 0.97 106\n", + " Business Services 1.00 0.99 1.00 146\n", + " Personal Services 1.00 0.96 0.98 75\n", + " Taxes 0.98 0.94 0.96 47\n", + " Gambling 1.00 1.00 1.00 15\n", + " Home 0.98 0.89 0.93 168\n", + "Pension and insurances 0.99 1.00 1.00 110\n", + "\n", + " accuracy 0.90 10000\n", + " macro avg 0.97 0.93 0.95 10000\n", + " weighted avg 0.91 0.90 0.90 10000\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "49fdc82d", + "metadata": {}, + "source": [ + "### 7. Clean up \n", + "\n", + "Remove the feature group and endpoint to clean up" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f79b1164", + "metadata": {}, + "outputs": [], + "source": [ + "#feature_group.delete()\n", + "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e04b6fa6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This us-east-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-west-3 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-northeast-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", + "\n", + "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n" + ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "smdemo", + "language": "python", + "name": "smdemo" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/functions/evaluate.py b/src/functions/evaluate.py index c905379..83e205a 100644 --- a/src/functions/evaluate.py +++ b/src/functions/evaluate.py @@ -56,6 +56,8 @@ def evaluate( # convert to pandas dataframe: test_set = pd.read_csv(test_set_temp_path) + print(test_set) + # convert to xgboost object: test_data = xgb.DMatrix(test_set.drop(columns=[label_column], axis=1)) @@ -67,7 +69,7 @@ def evaluate( # generate classification report: report = classification_report( - y_true=test_set["transaction_category_mapped"].to_list(), + y_true=test_set["transaction_category"].to_list(), y_pred=predictions, target_names=factorize_key, output_dict=True, From b5f11c180692bec64654490986653e9b32870720 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 12 Feb 2024 13:40:36 +0000 Subject: [PATCH 13/16] cleanning development notebooks --- financial_payment_classification_v2.ipynb | 435 ++-------- serving-Copy1.ipynb | 540 ------------ serving.ipynb | 955 
---------------------- src/functions/serving.py | 5 + 4 files changed, 77 insertions(+), 1858 deletions(-) delete mode 100644 serving-Copy1.ipynb delete mode 100644 serving.ipynb diff --git a/financial_payment_classification_v2.ipynb b/financial_payment_classification_v2.ipynb index 7310897..cb49af7 100644 --- a/financial_payment_classification_v2.ipynb +++ b/financial_payment_classification_v2.ipynb @@ -108,7 +108,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-06 08:47:06,901 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" + "> 2024-02-06 12:51:51,361 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" ] } ], @@ -675,7 +675,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 14, @@ -877,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 16, "id": "50441ed4-a228-44e7-87ce-024177b928f6", "metadata": {}, "outputs": [], @@ -903,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "19081c06-240e-481b-bfe3-588bb77bd54e", "metadata": {}, "outputs": [], @@ -959,7 +959,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "8993721e-f0e5-4438-ab55-2f9bfb78e20a", "metadata": {}, "outputs": [ @@ -1025,10 +1025,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1055,7 +1055,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "2085e0a9-56e1-4641-a4a6-64e2124d9c15", "metadata": {}, "outputs": [ @@ -1063,7 +1063,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-06 08:03:53,056 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" + "> 2024-02-06 13:03:26,132 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. 
This may result in errors or unusable data.\n" ] }, { @@ -1436,7 +1436,7 @@ "[99997 rows x 30 columns]" ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1486,7 +1486,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "bb4bdd8d", "metadata": {}, "outputs": [], @@ -1507,7 +1507,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "id": "f849a7a9", "metadata": {}, "outputs": [], @@ -1527,7 +1527,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "id": "e1ca2543", "metadata": {}, "outputs": [], @@ -1553,7 +1553,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "id": "a41b6a7d", "metadata": {}, "outputs": [], @@ -1571,7 +1571,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "id": "e51c917a", "metadata": {}, "outputs": [], @@ -1594,7 +1594,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "id": "92c1fe8c", "metadata": {}, "outputs": [], @@ -1619,7 +1619,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "id": "582adc6c", "metadata": {}, "outputs": [], @@ -1647,7 +1647,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "id": "c24e06fc", "metadata": { "scrolled": true @@ -1657,39 +1657,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-06-08-05-13-165\n" + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-06-13-03-44-059\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-02-06 08:05:13 Starting - Starting the training job......\n", - "2024-02-06 08:05:48 Starting - Preparing the instances for training...\n", - "2024-02-06 08:06:39 Downloading - Downloading input data...\n", - "2024-02-06 08:07:09 Downloading - Downloading the training 
image......\n", - "2024-02-06 08:07:59 Training - Training image download completed. Training in progress...\u001b[34m[2024-02-06 08:08:16.516 ip-10-0-135-116.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", + "2024-02-06 13:03:44 Starting - Starting the training job...\n", + "2024-02-06 13:04:08 Starting - Preparing the instances for training.........\n", + "2024-02-06 13:05:26 Downloading - Downloading input data...\n", + "2024-02-06 13:05:56 Downloading - Downloading the training image......\n", + "2024-02-06 13:06:51 Training - Training image download completed. Training in progress..\u001b[34m[2024-02-06 13:07:07.762 ip-10-0-153-109.us-east-2.compute.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:07:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:07:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Train 
matrix has 69997 rows and 29 columns\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Validation matrix has 20000 rows\u001b[0m\n", - "\u001b[34m[2024-02-06 08:08:16.809 ip-10-0-135-116.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-06 08:08:16.809 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-06 08:08:16.810 ip-10-0-135-116.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-06 08:08:16.811 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-06 08:08:16.811 ip-10-0-135-116.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-06:08:08:16:INFO] Debug hook created from config\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:07:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:07:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:07:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:07:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:07:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:08:INFO] Single node training.\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:08:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:08:INFO] Validation matrix has 20000 rows\u001b[0m\n", + "\u001b[34m[2024-02-06 
13:07:08.056 ip-10-0-153-109.us-east-2.compute.internal:8 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", + "\u001b[34m[2024-02-06 13:07:08.057 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", + "\u001b[34m[2024-02-06 13:07:08.057 ip-10-0-153-109.us-east-2.compute.internal:8 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", + "\u001b[34m[2024-02-06 13:07:08.058 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", + "\u001b[34m[2024-02-06 13:07:08.058 ip-10-0-153-109.us-east-2.compute.internal:8 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", + "\u001b[34m[2024-02-06:13:07:08:INFO] Debug hook created from config\u001b[0m\n", "\u001b[34m[0]#011train-merror:0.54515#011validation-merror:0.55430\u001b[0m\n", - "\u001b[34m[2024-02-06 08:08:18.833 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-06 08:08:18.836 ip-10-0-135-116.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", + "\u001b[34m[2024-02-06 13:07:10.007 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", + "\u001b[34m[2024-02-06 13:07:10.012 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:486] Hook is writing from the hook with pid: 8\u001b[0m\n", "\u001b[34m[1]#011train-merror:0.53387#011validation-merror:0.54255\u001b[0m\n", "\u001b[34m[2]#011train-merror:0.52198#011validation-merror:0.53050\u001b[0m\n", "\u001b[34m[3]#011train-merror:0.51036#011validation-merror:0.52010\u001b[0m\n", @@ -1782,7 +1782,8 @@ 
"\u001b[34m[90]#011train-merror:0.12629#011validation-merror:0.14090\u001b[0m\n", "\u001b[34m[91]#011train-merror:0.12568#011validation-merror:0.14010\u001b[0m\n", "\u001b[34m[92]#011train-merror:0.12215#011validation-merror:0.13690\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.11968#011validation-merror:0.13450\u001b[0m\n", + "\n", + "2024-02-06 13:10:17 Uploading - Uploading generated training model\u001b[34m[93]#011train-merror:0.11968#011validation-merror:0.13450\u001b[0m\n", "\u001b[34m[94]#011train-merror:0.11878#011validation-merror:0.13360\u001b[0m\n", "\u001b[34m[95]#011train-merror:0.11785#011validation-merror:0.13240\u001b[0m\n", "\u001b[34m[96]#011train-merror:0.11631#011validation-merror:0.13090\u001b[0m\n", @@ -1790,10 +1791,9 @@ "\u001b[34m[98]#011train-merror:0.11213#011validation-merror:0.12605\u001b[0m\n", "\u001b[34m[99]#011train-merror:0.11039#011validation-merror:0.12445\u001b[0m\n", "\n", - "2024-02-06 08:11:41 Uploading - Uploading generated training model\n", - "2024-02-06 08:11:41 Completed - Training job completed\n", - "Training seconds: 303\n", - "Billable seconds: 303\n" + "2024-02-06 13:10:33 Completed - Training job completed\n", + "Training seconds: 308\n", + "Billable seconds: 308\n" ] } ], @@ -1813,17 +1813,17 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-06-08-05-13-165/output/model.tar.gz'" + "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-06-13-03-44-059/output/model.tar.gz'" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1834,7 +1834,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "id": "78444d49-4ad3-49e4-a579-19b173facb26", "metadata": {}, "outputs": 
[], @@ -1844,7 +1844,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", "metadata": {}, "outputs": [ @@ -1908,10 +1908,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 31, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1933,7 +1933,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", "metadata": {}, "outputs": [ @@ -1941,12 +1941,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-02-06 08:13:22,438 [info] Starting remote function deploy\n", - "2024-02-06 08:13:22 (info) Deploying function\n", - "2024-02-06 08:13:22 (info) Building\n", - "2024-02-06 08:13:23 (info) Staging files and preparing base images\n", - "2024-02-06 08:13:23 (info) Building processor image\n", - "2024-02-06 08:15:18 (info) Build complete\n", + "> 2024-02-06 13:10:58,760 [info] Starting remote function deploy\n", + "2024-02-06 13:10:59 (info) Deploying function\n", + "2024-02-06 13:10:59 (info) Building\n", + "2024-02-06 13:10:59 (info) Staging files and preparing base images\n", + "2024-02-06 13:10:59 (info) Building processor image\n", + "2024-02-06 13:12:04 (info) Build complete\n", "Failed to deploy. 
Details:\n", "Traceback (most recent call last):\n", " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", @@ -2012,7 +2012,7 @@ " await self._initialize_context()\n", " File \"/opt/nuclio/_nuclio_wrapper.py\", line 188, in _initialize_context\n", " init_context_result = getattr(self._entrypoint_module, 'init_context')(self._context)\n", - " File \"/opt/nuclio/serving.py\", line 116, in init_context\n", + " File \"/opt/nuclio/serving.py\", line 135, in init_context\n", " nuclio_init_hook(context, globals(), 'serving_v2')\n", " File \"/opt/conda/lib/python3.9/site-packages/mlrun/runtimes/nuclio.py\", line 34, in nuclio_init_hook\n", " v2_serving_init(context, data)\n", @@ -2031,7 +2031,7 @@ " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 131, in _load_and_update_state\n", " raise RuntimeError(f\"failed to load model {self.name}\") from exc\n", "RuntimeError: failed to load model xgboost-model\n", - "> 2024-02-06 08:15:44,441 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" + "> 2024-02-06 13:12:30,976 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" ] }, { @@ -2041,7 +2041,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[32], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[31], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/project.py:3188\u001b[0m, in \u001b[0;36mMlrunProject.deploy_function\u001b[0;34m(self, function, dashboard, models, env, tag, verbose, builder_env, mock)\u001b[0m\n\u001b[1;32m 3166\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeploy_function\u001b[39m(\n\u001b[1;32m 3167\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 3168\u001b[0m function: typing\u001b[38;5;241m.\u001b[39mUnion[\u001b[38;5;28mstr\u001b[39m, mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mBaseRuntime],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3175\u001b[0m mock: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 3176\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mUnion[DeployStatus, kfp\u001b[38;5;241m.\u001b[39mdsl\u001b[38;5;241m.\u001b[39mContainerOp]:\n\u001b[1;32m 3177\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"deploy real-time (nuclio based) functions\u001b[39;00m\n\u001b[1;32m 3178\u001b[0m \n\u001b[1;32m 3179\u001b[0m \u001b[38;5;124;03m :param function: name of the function (in the project) or function object\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3186\u001b[0m \u001b[38;5;124;03m :param mock: deploy mock server vs a real Nuclio function (for local simulations)\u001b[39;00m\n\u001b[1;32m 3187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3189\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
3190\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3192\u001b[0m \u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3194\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3196\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmock\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmock\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/operations.py:395\u001b[0m, in \u001b[0;36mdeploy_function\u001b[0;34m(function, dashboard, models, env, tag, verbose, builder_env, project_object, mock)\u001b[0m\n\u001b[1;32m 388\u001b[0m function\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 390\u001b[0m state\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 391\u001b[0m 
outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname},\n\u001b[1;32m 392\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 393\u001b[0m )\n\u001b[0;32m--> 395\u001b[0m address \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# return object with the same outputs as the KFP op (allow using the same pipeline)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 400\u001b[0m state\u001b[38;5;241m=\u001b[39mfunction\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 401\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: address, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mnuclio_name},\n\u001b[1;32m 402\u001b[0m 
function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 403\u001b[0m )\n", "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:647\u001b[0m, in \u001b[0;36mServingRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_deploy_function_refs()\n\u001b[1;32m 645\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeploy root function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m ...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 652\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 654\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 655\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", @@ -2057,7 +2057,7 @@ }, { "cell_type": "code", - "execution_count": 
33, + "execution_count": null, "id": "c858e3e9-9e43-4148-8015-6047565db456", "metadata": {}, "outputs": [], @@ -2067,18 +2067,10 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-04 15:52:51,734 [info] invoking function: {'method': 'POST', 'path': 'http://sagemaker-admin-serving-sagemaker-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//predict'}\n" - ] - } - ], + "outputs": [], "source": [ "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" ] @@ -2095,7 +2087,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "2e863ea7-5804-4637-b677-390c305cabfe", "metadata": {}, "outputs": [], @@ -2113,7 +2105,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "ca4f7e49", "metadata": {}, "outputs": [], @@ -2131,293 +2123,10 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-04 15:52:51,982 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '6c838c8bbb234d7eb642d9401eff7068', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", - "> 2024-02-04 15:52:52,382 [info] Job is running in the background, pod: evaluate-evaluate-hdk2c\n", - "> 2024-02-04 15:52:56,798 [error] Execution error, Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/runtimes/local.py\", line 475, in exec_from_params\n", - " val = mlrun.handler(\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/package/__init__.py\", line 140, in wrapper\n", - " func_outputs = func(*args, **kwargs)\n", - " File \"evaluate.py\", line 44, in evaluate\n", - " 
model_temp_path = _download_object_from_s3(model_path, suffix=\".tar.gz\")\n", - " File \"evaluate.py\", line 88, in _download_object_from_s3\n", - " obj.download(temp_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", - " self._store.download(self._path, target_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", - " data = self.get(key)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", - " return obj.get()[\"Body\"].read()\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", - " response = action(self, *args, **kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", - " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", - " return self._make_api_call(operation_name, kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", - " raise error_class(parsed_response, operation_name)\n", - "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", - "\n", - "> 2024-02-04 15:52:56,835 [error] Exec error - An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", - "An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", - "> 2024-02-04 15:52:56,879 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 6c838c8bbb234d7eb642d9401eff7068 -p sagemaker-admin', 'logs_cmd': 'mlrun logs 6c838c8bbb234d7eb642d9401eff7068 -p sagemaker-admin'}\n", - "> 2024-02-04 15:52:56,879 [info] Or click for UI: {'ui_url': 
'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-admin/jobs/monitor/6c838c8bbb234d7eb642d9401eff7068/overview'}\n", - "> 2024-02-04 15:52:56,880 [info] Run execution finished: {'status': 'error', 'name': 'evaluate-evaluate'}\n", - "Runtime error: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sagemaker-admin0Feb 04 15:52:55
error
evaluate-evaluate
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc22
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-hdk2c
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category
factorize_key={'Uncategorized': '0', 'Entertainment': '1', 'Education': '2', 'Shopping': '3', 'Personal Care': '4', 'Health and Fitness': '5', 'Food and Dining': '6', 'Gifts and Donations': '7', 'Investments': '8', 'Bills and Utilities': '9', 'Auto and Transport': '10', 'Travel': '11', 'Fees and Charges': '12', 'Business Services': '13', 'Personal Services': '14', 'Taxes': '15', 'Gambling': '16', 'Home': '17', 'Pension and insurances': '18'}
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-04 15:53:03,539 [info] Run execution finished: {'status': 'error', 'name': 'evaluate-evaluate'}\n", - "> 2024-02-04 15:53:03,540 [error] Run did not finish successfully: {'state': 'error', 'status': {'state': 'error', 'error': 'An error occurred (AccessDenied) when calling the GetObject operation: Access Denied', 'artifacts': [], 'start_time': '2024-02-04T15:52:55.840221+00:00', 'last_update': '2024-02-04T15:52:56.873792+00:00'}}\n" - ] - }, - { - "ename": "RunError", - "evalue": "An error occurred (AccessDenied) when calling the GetObject operation: Access Denied", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[37], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m evaluate_run \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate_function\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mevaluate\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel_path\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mxgb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_data\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mxgboost-model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtest_set\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43ms3_data\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlabel_column\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtransaction_category\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfactorize_key\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mfactorize_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclassification_report: dataset\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/base.py:369\u001b[0m, in \u001b[0;36mBaseRuntime.run\u001b[0;34m(self, 
runspec, handler, name, project, params, inputs, out_path, workdir, artifact_path, watch, schedule, hyperparams, hyper_param_options, verbose, scrape_metrics, local, local_code_path, auto_build, param_file_secrets, notifications, returns, state_thresholds, **launcher_kwargs)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 313\u001b[0m \u001b[38;5;124;03mRun a local or remote task.\u001b[39;00m\n\u001b[1;32m 314\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[38;5;124;03m:return: Run context object (RunObject) with run metadata, results and status\u001b[39;00m\n\u001b[1;32m 365\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 366\u001b[0m launcher \u001b[38;5;241m=\u001b[39m mlrun\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mfactory\u001b[38;5;241m.\u001b[39mLauncherFactory()\u001b[38;5;241m.\u001b[39mcreate_launcher(\n\u001b[1;32m 367\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_remote, local\u001b[38;5;241m=\u001b[39mlocal, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mlauncher_kwargs\n\u001b[1;32m 368\u001b[0m )\n\u001b[0;32m--> 369\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlauncher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 370\u001b[0m \u001b[43m \u001b[49m\u001b[43mruntime\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[43mtask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrunspec\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandler\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhandler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 374\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 376\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 377\u001b[0m \u001b[43m \u001b[49m\u001b[43mout_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mout_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 378\u001b[0m \u001b[43m \u001b[49m\u001b[43mworkdir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mworkdir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 379\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifact_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 380\u001b[0m \u001b[43m \u001b[49m\u001b[43mwatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 381\u001b[0m \u001b[43m \u001b[49m\u001b[43mschedule\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[43m \u001b[49m\u001b[43mhyperparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhyperparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[43m \u001b[49m\u001b[43mhyper_param_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhyper_param_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mscrape_metrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscrape_metrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_code_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_code_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mauto_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mauto_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mparam_file_secrets\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparam_file_secrets\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mnotifications\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnotifications\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_thresholds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstate_thresholds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/launcher/remote.py:113\u001b[0m, in \u001b[0;36mClientRemoteLauncher.launch\u001b[0;34m(self, runtime, task, handler, name, project, params, inputs, out_path, workdir, artifact_path, watch, schedule, hyperparams, hyper_param_options, verbose, scrape_metrics, local_code_path, auto_build, param_file_secrets, notifications, returns, state_thresholds)\u001b[0m\n\u001b[1;32m 105\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 106\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStoring function\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 107\u001b[0m name\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m 
108\u001b[0m uid\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39muid,\n\u001b[1;32m 109\u001b[0m db\u001b[38;5;241m=\u001b[39mruntime\u001b[38;5;241m.\u001b[39mspec\u001b[38;5;241m.\u001b[39mrundb,\n\u001b[1;32m 110\u001b[0m )\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_store_function(runtime, run)\n\u001b[0;32m--> 113\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_submit_job\u001b[49m\u001b[43m(\u001b[49m\u001b[43mruntime\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mschedule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwatch\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/launcher/remote.py:182\u001b[0m, in \u001b[0;36mClientRemoteLauncher._submit_job\u001b[0;34m(self, runtime, run, schedule, watch)\u001b[0m\n\u001b[1;32m 179\u001b[0m run\u001b[38;5;241m.\u001b[39mlogs(\u001b[38;5;28;01mTrue\u001b[39;00m, runtime\u001b[38;5;241m.\u001b[39m_get_db())\n\u001b[1;32m 180\u001b[0m resp \u001b[38;5;241m=\u001b[39m runtime\u001b[38;5;241m.\u001b[39m_get_db_run(run)\n\u001b[0;32m--> 182\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wrap_run_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43mruntime\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mschedule\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mschedule\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/launcher/base.py:409\u001b[0m, in \u001b[0;36mBaseLauncher._wrap_run_result\u001b[0;34m(self, runtime, result, run, 
schedule, err)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m runtime\u001b[38;5;241m.\u001b[39m_is_remote \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m runtime\u001b[38;5;241m.\u001b[39mis_child:\n\u001b[1;32m 404\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\n\u001b[1;32m 405\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRun did not finish successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 406\u001b[0m state\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 407\u001b[0m status\u001b[38;5;241m=\u001b[39mrun\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mto_dict(),\n\u001b[1;32m 408\u001b[0m )\n\u001b[0;32m--> 409\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mRunError(run\u001b[38;5;241m.\u001b[39merror)\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m run\n\u001b[1;32m 412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "\u001b[0;31mRunError\u001b[0m: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied" - ] - } - ], + "outputs": [], "source": [ "evaluate_run = evaluate_function.run(\n", " handler=\"evaluate\",\n", diff --git a/serving-Copy1.ipynb b/serving-Copy1.ipynb deleted file mode 100644 index 78ab73d..0000000 --- a/serving-Copy1.ipynb +++ /dev/null @@ -1,540 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 24, - "id": "1b3d7eb9-b601-47b4-a914-191e5bcf2764", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "41fa803e-cd2c-46ff-ba0c-6ff6d7b0b92c", - "metadata": {}, - "outputs": [], - "source": [ - "#import sys\n", - "#!{sys.executable} -m pip install --upgrade xgboost --quiet # upgrade boto to the latest vesion" - ] - }, - { 
- "cell_type": "code", - "execution_count": 26, - "id": "2c7bb858-9603-4c67-92c0-722b0cf24714", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-08 17:39:51,937 [info] Project loaded successfully: {'project_name': 'sagemaker-v3'}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v3\", \n", - " user_project=True,\n", - " parameters={\n", - " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", - " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "74dab54c-6348-4a18-9db5-5d8074370fb0", - "metadata": {}, - "outputs": [], - "source": [ - "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-08-17-31-38-814/output/model.tar.gz'" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "3ee1d9bd-2652-4349-8df5-e231edb6acfa", - "metadata": {}, - "outputs": [], - "source": [ - "test_serving_function = project.set_function(\n", - " func=\"src/functions/serving.py\",\n", - " name=\"test-serving\",\n", - " kind=\"serving\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "3da03265-204e-4600-8746-adc81f7ce3bf", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "data = pd.read_csv(\n", - " \"test.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "b004f4de-cab6-47b6-b786-07b0601eac82", - "metadata": {}, - "outputs": [], - "source": [ - "data_cols = list(data.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5122852e-74b7-409b-b7c8-0941d22ba2d2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['transaction_category_key',\n", - " 'amount_avg_1d',\n", - " 'receiver_id',\n", - " 'sender_id',\n", - " 'amount',\n", - " 'timestamp_year',\n", - " 
'timestamp_month',\n", - " 'timestamp_day',\n", - " 'timestamp_hour',\n", - " 'timestamp_minute',\n", - " 'timestamp_second',\n", - " 'distance']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_cols" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "82817363-e449-4c0b-8527-1642b74c9406", - "metadata": {}, - "outputs": [], - "source": [ - "# # Set the topology and get the graph object:\n", - "# graph = test_serving_function.set_topology(\"flow\", engine=\"async\")\n", - "\n", - "# # Add the steps:\n", - "# graph.to(\"XGBModelServer\",\n", - "# name=\"xgboost-model\",\n", - "# model_path=model_path) \\\n", - "# .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", - "\n", - "# # Plot to graph:\n", - "# test_serving_function.plot(rankdir='LR')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "e07c6b0b-7c9a-4c44-bb08-ea8a865a043c", - "metadata": {}, - "outputs": [], - "source": [ - "#print(test_serving_function.spec.to_yaml())" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "6a291b9c-0acc-4807-ab8e-4bec180a2bbf", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "_start->\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "xgboost-model\n", - "\n", - "xgboost-model\n", - "\n", - "\n", - "\n", - "->xgboost-model\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "graph = test_serving_function.set_topology(\n", - " \"router\",\n", - " mlrun.serving.routers.EnrichmentModelRouter(\n", - " feature_vector_uri=\"store://feature-vectors/sagemaker-v3-admin/transactions-vector:latest\",\n", - " 
impute_policy={\"*\": \"$mean\"}),\n", - ")\n", - "\n", - "#graph.to(handler=\"postprocess\", name=\"postprocess\").respond()\n", - "# # add the 3 trained models to the Ensemble\n", - "# for model in project.list_models('', tag='latest'):\n", - "# name = model.spec.db_key\n", - "# serving_fn.add_model(name, class_name=\"ClassifierModel\", model_path=model.uri)\n", - "\n", - "test_serving_function.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=model_path)\n", - "\n", - "# Plot the ensemble configuration\n", - "test_serving_function.spec.graph.plot()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58874abf-f7c8-4f40-83d0-8beb92dfd550", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "fb976023-5a2c-4dc8-b1b7-fd897446b747", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-08 17:41:19,924 [info] model xgboost-model was loaded\n", - "> 2024-02-08 17:41:19,925 [info] Loaded ['xgboost-model']\n" - ] - } - ], - "source": [ - "server = test_serving_function.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "35e98782-129d-4ffb-b27e-d580589d6106", - "metadata": {}, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "# data = pd.read_csv(\n", - "# \"test.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "8f67e627-cfbf-4b8b-a5b3-7b9e836f779a", - "metadata": {}, - "outputs": [], - "source": [ - "# data = data[:1]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "90abdb9d-3140-45eb-9a4d-a83a74b95700", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryamount_avg_1dreceiver_idsender_idamounttimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
0Shopping67.184630518417004166489829033178327867.18202311716343033.59
\n", - "
" - ], - "text/plain": [ - " transaction_category amount_avg_1d receiver_id sender_id \\\n", - "0 Shopping 67.18 4630518417004166 4898290331783278 \n", - "\n", - " amount timestamp_year timestamp_month timestamp_day timestamp_hour \\\n", - "0 67.18 2023 11 7 16 \n", - "\n", - " timestamp_minute timestamp_second distance \n", - "0 34 30 33.59 " - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# data" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "09081ad4-e6d9-4ed8-9d47-ee0175bd291e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[53.07,\n", - " 4726536548206059.0,\n", - " 4070478627221885.0,\n", - " 53.07,\n", - " 2021.0,\n", - " 2.0,\n", - " 19.0,\n", - " 19.0,\n", - " 37.0,\n", - " 40.0,\n", - " 26.535]]" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# data_ls = data.values.tolist()\n", - "# data_ls" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "d9b9df7b-3fce-4e2b-b739-2a845ae1df30", - "metadata": {}, - "outputs": [], - "source": [ - "#inputs_data = {'inputs': data_ls }" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "dff40293-9d50-400c-9a1b-62a7e610e176", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'inputs': [['Pension and insurances', 207.92, 4627516674144704, 4250420705087194, 207.92, 2024, 2, 8, 17, 19, 20, 103.96]]}\n", - "> 2024-02-08 17:42:48,779 [error] run error, Traceback (most recent call last):\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py\", line 280, in run\n", - " response = self.graph.run(event, **(extra_args or {}))\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 548, in run\n", - " raise exc\n", - " File 
\"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 531, in run\n", - " return self._handler(event, *args, **kwargs)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 151, in do_event\n", - " event = self.postprocess(self._handle_event(event))\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/routers.py\", line 209, in _handle_event\n", - " response = route.run(event)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 548, in run\n", - " raise exc\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/states.py\", line 531, in run\n", - " return self._handler(event, *args, **kwargs)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 249, in do_event\n", - " raise exc\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 244, in do_event\n", - " outputs = self.predict(request)\n", - " File \"src/functions/serving.py\", line 39, in predict\n", - " data = xgb.DMatrix(data)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/core.py\", line 730, in inner_f\n", - " return func(**kwargs)\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/core.py\", line 857, in __init__\n", - " handle, feature_names, feature_types = dispatch_data_backend(\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/data.py\", line 1075, in dispatch_data_backend\n", - " return _from_numpy_array(\n", - " File \"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/data.py\", line 207, in _from_numpy_array\n", - " _check_call(\n", - " File 
\"/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/core.py\", line 282, in _check_call\n", - " raise XGBoostError(py_str(_LIB.XGBGetLastError()))\n", - "xgboost.core.XGBoostError: [17:42:48] /workspace/src/c_api/../data/array_interface.h:492: Unicode-3 is not supported.\n", - "Stack trace:\n", - " [bt] (0) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x15c2ca) [0x7f406c4212ca]\n", - " [bt] (1) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x19ff29) [0x7f406c464f29]\n", - " [bt] (2) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x16c) [0x7f406c435a6c]\n", - " [bt] (3) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x7f411ac7f052]\n", - " [bt] (4) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x7f411ac7d925]\n", - " [bt] (5) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f411ac7e06e]\n", - " [bt] (6) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x91e0) [0x7f411ac8f1e0]\n", - " [bt] (7) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x8568) [0x7f411ac8e568]\n", - " [bt] (8) /home/sagemaker-user/.conda/envs/smdemo/bin/python3.9(_PyObject_MakeTpCall+0x2ec) [0x4f073c]\n", - "\n", - "\n", - "\n" - ] - }, - { - "ename": "RuntimeError", - "evalue": "failed (400): XGBoostError: [17:42:48] /workspace/src/c_api/../data/array_interface.h:492: Unicode-3 is not supported.\nStack trace:\n [bt] (0) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x15c2ca) [0x7f406c4212ca]\n [bt] (1) 
/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x19ff29) [0x7f406c464f29]\n [bt] (2) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x16c) [0x7f406c435a6c]\n [bt] (3) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x7f411ac7f052]\n [bt] (4) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x7f411ac7d925]\n [bt] (5) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f411ac7e06e]\n [bt] (6) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x91e0) [0x7f411ac8f1e0]\n [bt] (7) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x8568) [0x7f411ac8e568]\n [bt] (8) /home/sagemaker-user/.conda/envs/smdemo/bin/python3.9(_PyObject_MakeTpCall+0x2ec) [0x4f073c]\n\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[37], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mserver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43minputs\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m99996\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/serving/server.py:250\u001b[0m, in \u001b[0;36mGraphServer.test\u001b[0;34m(self, 
path, body, method, headers, content_type, silent, get_body, event_id, trigger, offset, time)\u001b[0m\n\u001b[1;32m 248\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun(event, get_body\u001b[38;5;241m=\u001b[39mget_body)\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(resp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus_code\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m resp\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m300\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m silent:\n\u001b[0;32m--> 250\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfailed (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m): \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 251\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", - "\u001b[0;31mRuntimeError\u001b[0m: failed (400): XGBoostError: [17:42:48] /workspace/src/c_api/../data/array_interface.h:492: Unicode-3 is not supported.\nStack trace:\n [bt] (0) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x15c2ca) [0x7f406c4212ca]\n [bt] (1) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x19ff29) [0x7f406c464f29]\n [bt] (2) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGDMatrixCreateFromDense+0x16c) [0x7f406c435a6c]\n [bt] (3) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x7f411ac7f052]\n [bt] (4) 
/home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x7f411ac7d925]\n [bt] (5) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f411ac7e06e]\n [bt] (6) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x91e0) [0x7f411ac8f1e0]\n [bt] (7) /home/sagemaker-user/.conda/envs/smdemo/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x8568) [0x7f411ac8e568]\n [bt] (8) /home/sagemaker-user/.conda/envs/smdemo/bin/python3.9(_PyObject_MakeTpCall+0x2ec) [0x4f073c]\n\n" - ] - } - ], - "source": [ - "response = server.test(body={'inputs':[[99996]]})" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "ef10a992-7fce-424f-8733-f1eb190f7c42", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'response' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[38], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mresponse\u001b[49m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'response' is not defined" - ] - } - ], - "source": [ - "print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "381d3223-4ff0-454f-b0b7-1ed9589faca5", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "smdemo", - "language": "python", - "name": "smdemo" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git 
a/serving.ipynb b/serving.ipynb deleted file mode 100644 index 2eb5e98..0000000 --- a/serving.ipynb +++ /dev/null @@ -1,955 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "1b3d7eb9-b601-47b4-a914-191e5bcf2764", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "41fa803e-cd2c-46ff-ba0c-6ff6d7b0b92c", - "metadata": {}, - "outputs": [], - "source": [ - "#import sys\n", - "#!{sys.executable} -m pip install --upgrade xgboost --quiet # upgrade boto to the latest vesion" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "2c7bb858-9603-4c67-92c0-722b0cf24714", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-06 08:34:06,016 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v2\", \n", - " user_project=True,\n", - " parameters={\n", - " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", - " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "74dab54c-6348-4a18-9db5-5d8074370fb0", - "metadata": {}, - "outputs": [], - "source": [ - "model_path = 's3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-04-15-43-54-687/output/model.tar.gz'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3ee1d9bd-2652-4349-8df5-e231edb6acfa", - "metadata": {}, - "outputs": [], - "source": [ - "test_serving_function = project.set_function(\n", - " func=\"src/functions/serving.py\",\n", - " name=\"test-serving\",\n", - " kind=\"serving\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6a291b9c-0acc-4807-ab8e-4bec180a2bbf", - "metadata": {}, - "outputs": [ - { - "data": { - 
"image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "preprocess\n", - "\n", - "preprocess\n", - "\n", - "\n", - "\n", - "_start->preprocess\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Set the topology and get the graph object:\n", - "graph = test_serving_function.set_topology(\"flow\", engine=\"async\")\n", - "\n", - "# Add the steps:\n", - "graph.to(handler=\"preprocess\", name=\"preprocess\").respond()\n", - "\n", - " # .to(\"XGBModelServer\",\n", - " # name=\"xgboost-model\",\n", - " # model_path=model_path) \\\n", - " # .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", - "\n", - "# Plot to graph:\n", - "test_serving_function.plot(rankdir='LR')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fb976023-5a2c-4dc8-b1b7-fd897446b747", - "metadata": {}, - "outputs": [], - "source": [ - "server = test_serving_function.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "35e98782-129d-4ffb-b27e-d580589d6106", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "data = pd.read_csv(\n", - " \"financial_transactions_mini.csv\",\n", - " parse_dates=[\"timestamp\"],\n", - " infer_datetime_format=True,\n", - " dtype={\"transaction_category\": \"string\"},\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "90ae1e7b-8e46-418b-b057-76071a22a8c1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamounttimestamp
0Uncategorized45185519044999194333582346477646833.262021-03-10 19:57:42
1Uncategorized45185519044999194642413144038776596.632021-02-11 17:53:32
2Uncategorized42745440229395224952665515556751176.762021-02-21 18:29:32
3Uncategorized45185519044999194457298962882528879.782021-04-09 16:14:19
4Uncategorized46018532461252204578126462896710742.252021-04-04 15:50:16
..................
99992Pension and insurances44050083552203244583355906735225205.432021-04-20 12:23:53
99993Pension and insurances43004167445113354949240916846171151.492021-03-24 19:30:18
99994Pension and insurances44050083552203244996896020767264188.282021-03-08 19:51:10
99995Pension and insurances42620471944990064017367486513464204.262021-02-14 23:25:07
99996Pension and insurances46275166741447044250420705087194207.922021-04-14 00:42:00
\n", - "

99997 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "0 Uncategorized 4518551904499919 4333582346477646 833.26 \n", - "1 Uncategorized 4518551904499919 4642413144038776 596.63 \n", - "2 Uncategorized 4274544022939522 4952665515556751 176.76 \n", - "3 Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "4 Uncategorized 4601853246125220 4578126462896710 742.25 \n", - "... ... ... ... ... \n", - "99992 Pension and insurances 4405008355220324 4583355906735225 205.43 \n", - "99993 Pension and insurances 4300416744511335 4949240916846171 151.49 \n", - "99994 Pension and insurances 4405008355220324 4996896020767264 188.28 \n", - "99995 Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "99996 Pension and insurances 4627516674144704 4250420705087194 207.92 \n", - "\n", - " timestamp \n", - "0 2021-03-10 19:57:42 \n", - "1 2021-02-11 17:53:32 \n", - "2 2021-02-21 18:29:32 \n", - "3 2021-04-09 16:14:19 \n", - "4 2021-04-04 15:50:16 \n", - "... ... 
\n", - "99992 2021-04-20 12:23:53 \n", - "99993 2021-03-24 19:30:18 \n", - "99994 2021-03-08 19:51:10 \n", - "99995 2021-02-14 23:25:07 \n", - "99996 2021-04-14 00:42:00 \n", - "\n", - "[99997 rows x 5 columns]" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "cf821aae-83e4-4cc7-ba4a-b3038f7fd954", - "metadata": {}, - "outputs": [], - "source": [ - "data['transaction_id'] = data.reset_index().index" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "057e2627-588b-464e-aa67-5f9daf209d5c", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "test_data = pd.read_csv(\n", - " \"test.csv\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "fda9453e-c423-4c39-9c93-1ce93e42c38a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'receiver_id': 4518551904499919,\n", - " 'sender_id': 4333582346477646,\n", - " 'amount': 833.26,\n", - " 'timestamp': Timestamp('2021-03-10 19:57:42'),\n", - " 'transaction_id': 0}]" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "first_event_data = data.drop('transaction_category',axis=1)[:1].to_dict('records')\n", - "first_event_data" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "0ba6eef2-b37c-4db0-adc1-a93b9b9246ab", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun.feature_store as fstore\n", - "def get_realtime_transactions_aggregations():\n", - " # Create a feature vector that gets the average amount\n", - " vector = fstore.FeatureVector(\"aggregations-vector\", [\"aggregations.amount_avg_1d\"], with_indexes=True)\n", - " #get the categories list\n", - " unique_categories = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"10\",\"11\",\"12\",\"13\",\"14\",\"15\",\"16\"]\n", - " # Use online 
feature service to get the latest average amount per category\n", - " with vector.get_online_feature_service() as online_feature_service:\n", - " resp = online_feature_service.get(\n", - " [{\"transaction_category\":cat} for cat in unique_categories]\n", - " )\n", - " return resp\n" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "84877799-a628-46db-baaf-970dfbd05a67", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'amount_avg_1d': 606.79, 'transaction_category': '0'},\n", - " {'amount_avg_1d': 17.0, 'transaction_category': '1'},\n", - " {'amount_avg_1d': 883.925, 'transaction_category': '2'},\n", - " {'amount_avg_1d': 128.59, 'transaction_category': '3'},\n", - " {'amount_avg_1d': 36.695, 'transaction_category': '4'},\n", - " {'amount_avg_1d': 115.35, 'transaction_category': '5'},\n", - " {'amount_avg_1d': 190.425, 'transaction_category': '6'},\n", - " {'amount_avg_1d': 35.76, 'transaction_category': '7'},\n", - " {'amount_avg_1d': 5400.805, 'transaction_category': '8'},\n", - " {'amount_avg_1d': 163.965, 'transaction_category': '9'},\n", - " {'amount_avg_1d': 123.33, 'transaction_category': '10'},\n", - " {'amount_avg_1d': 265.07500000000005, 'transaction_category': '11'},\n", - " {'amount_avg_1d': 14.575, 'transaction_category': '12'},\n", - " {'amount_avg_1d': 119.17, 'transaction_category': '13'},\n", - " {'amount_avg_1d': 674.905, 'transaction_category': '14'},\n", - " {'amount_avg_1d': 4964.49, 'transaction_category': '15'},\n", - " {'amount_avg_1d': 166.99, 'transaction_category': '16'}]" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "resp = get_realtime_transactions_aggregations()\n", - "resp" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "627561cb-ed85-4e8e-8f1a-10c312724886", - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_distances(resp, event):\n", - " for cat in resp:\n", - " transaction_category = 
cat['transaction_category'] \n", - " amount_avg = cat['amount_avg_1d']\n", - " event[0][\"dist_\" + transaction_category] = abs(amount_avg - event[0][\"amount\"])\n", - "\n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "7f0ca55e-6f74-4b4a-9abe-9cd95a8507ee", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'receiver_id': 4518551904499919,\n", - " 'sender_id': 4333582346477646,\n", - " 'amount': 833.26,\n", - " 'timestamp': Timestamp('2021-03-10 19:57:42'),\n", - " 'transaction_id': 0,\n", - " 'dist_0': 226.47000000000003,\n", - " 'dist_1': 816.26,\n", - " 'dist_2': 50.664999999999964,\n", - " 'dist_3': 704.67,\n", - " 'dist_4': 796.5649999999999,\n", - " 'dist_5': 717.91,\n", - " 'dist_6': 642.835,\n", - " 'dist_7': 797.5,\n", - " 'dist_8': 4567.545,\n", - " 'dist_9': 669.295,\n", - " 'dist_10': 709.93,\n", - " 'dist_11': 568.185,\n", - " 'dist_12': 818.685,\n", - " 'dist_13': 714.09,\n", - " 'dist_14': 158.35500000000002,\n", - " 'dist_15': 4131.23,\n", - " 'dist_16': 666.27}]" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dist_event = calculate_distances(resp,first_event_data)\n", - "dist_event" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "350e295d-1723-4548-9704-b9f4003e544f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2021" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dist_event[0]['timestamp'].year" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "0ce33455-ec2a-4e34-a2eb-86f154eece84", - "metadata": {}, - "outputs": [], - "source": [ - "def convert_timestamp_to_components(event):\n", - " event[0][\"year\"] = event[0][\"timestamp\"].year\n", - " event[0][\"month\"] = event[0][\"timestamp\"].month\n", - " event[0][\"day\"] = event[0][\"timestamp\"].day\n", - " event[0][\"hour\"] = 
event[0][\"timestamp\"].hour\n", - " event[0][\"minute\"] = event[0][\"timestamp\"].minute\n", - " event[0][\"second\"] = event[0][\"timestamp\"].second\n", - " del event[0]['timestamp']\n", - "\n", - " return event\n" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "0471c226-00cf-40f0-9460-11ce4aaded9f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'receiver_id': 4518551904499919,\n", - " 'sender_id': 4333582346477646,\n", - " 'amount': 833.26,\n", - " 'transaction_id': 0,\n", - " 'dist_0': 226.47000000000003,\n", - " 'dist_1': 816.26,\n", - " 'dist_2': 50.664999999999964,\n", - " 'dist_3': 704.67,\n", - " 'dist_4': 796.5649999999999,\n", - " 'dist_5': 717.91,\n", - " 'dist_6': 642.835,\n", - " 'dist_7': 797.5,\n", - " 'dist_8': 4567.545,\n", - " 'dist_9': 669.295,\n", - " 'dist_10': 709.93,\n", - " 'dist_11': 568.185,\n", - " 'dist_12': 818.685,\n", - " 'dist_13': 714.09,\n", - " 'dist_14': 158.35500000000002,\n", - " 'dist_15': 4131.23,\n", - " 'dist_16': 666.27,\n", - " 'year': 2021,\n", - " 'month': 3,\n", - " 'day': 10,\n", - " 'hour': 19,\n", - " 'minute': 57,\n", - " 'second': 42}]" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extended_event = convert_timestamp_to_components(dist_event)\n", - "extended_event" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "22d5fc7a-f41c-4d41-b8b7-ebd084599f67", - "metadata": {}, - "outputs": [], - "source": [ - "def move_to_end(ls, key):\n", - " \"\"\"Move an item to the end of the dictionary.\"\"\"\n", - " d = ls[0]\n", - " if key in d:\n", - " value = d.pop(key) # Remove the item and get its value\n", - " d[key] = value # Reinsert the item, which moves it to the end\n", - " ls[0] = d\n", - " return ls" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "e1bba3af-ef47-45f7-987d-c6f57c819922", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"[{'receiver_id': 4518551904499919,\n", - " 'sender_id': 4333582346477646,\n", - " 'amount': 833.26,\n", - " 'dist_0': 226.47000000000003,\n", - " 'dist_1': 816.26,\n", - " 'dist_2': 50.664999999999964,\n", - " 'dist_3': 704.67,\n", - " 'dist_4': 796.5649999999999,\n", - " 'dist_5': 717.91,\n", - " 'dist_6': 642.835,\n", - " 'dist_7': 797.5,\n", - " 'dist_8': 4567.545,\n", - " 'dist_9': 669.295,\n", - " 'dist_10': 709.93,\n", - " 'dist_11': 568.185,\n", - " 'dist_12': 818.685,\n", - " 'dist_13': 714.09,\n", - " 'dist_14': 158.35500000000002,\n", - " 'dist_15': 4131.23,\n", - " 'dist_16': 666.27,\n", - " 'year': 2021,\n", - " 'month': 3,\n", - " 'day': 10,\n", - " 'hour': 19,\n", - " 'minute': 57,\n", - " 'second': 42,\n", - " 'transaction_id': 0}]" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "restructured_event = move_to_end(extended_event,'transaction_id')\n", - "restructured_event" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "05e84a7d-8c43-4c0f-8f08-8b10ac3ce624", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[4518551904499919,\n", - " 4333582346477646,\n", - " 833.26,\n", - " 226.47000000000003,\n", - " 816.26,\n", - " 50.664999999999964,\n", - " 704.67,\n", - " 796.5649999999999,\n", - " 717.91,\n", - " 642.835,\n", - " 797.5,\n", - " 4567.545,\n", - " 669.295,\n", - " 709.93,\n", - " 568.185,\n", - " 818.685,\n", - " 714.09,\n", - " 158.35500000000002,\n", - " 4131.23,\n", - " 666.27,\n", - " 2021,\n", - " 3,\n", - " 10,\n", - " 19,\n", - " 57,\n", - " 42,\n", - " 0]" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "values_list = list(restructured_event[0].values())\n", - "values_list" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "f32ae384-7be1-4842-9754-695fb0f3fb32", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[4518551904499919,\n", 
- " 4333582346477646,\n", - " 833.26,\n", - " 226.47000000000003,\n", - " 816.26,\n", - " 50.664999999999964,\n", - " 704.67,\n", - " 796.5649999999999,\n", - " 717.91,\n", - " 642.835,\n", - " 797.5,\n", - " 4567.545,\n", - " 669.295,\n", - " 709.93,\n", - " 568.185,\n", - " 818.685,\n", - " 714.09,\n", - " 158.35500000000002,\n", - " 4131.23,\n", - " 666.27,\n", - " 2021,\n", - " 3,\n", - " 10,\n", - " 19,\n", - " 57,\n", - " 42,\n", - " 0]]" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "return_list = [values_list]\n", - "return_list" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "e4367c70-00d8-4a1e-a042-9bba154a7f17", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[4572835609402945.0,\n", - " 4036699444587678.5,\n", - " 1936.83,\n", - " 1417.285,\n", - " 1883.725,\n", - " 1838.2753142857143,\n", - " 1517.5099999999998,\n", - " 1917.2477777777776,\n", - " 1688.4639999999995,\n", - " 1418.1149999999975,\n", - " 1324.3757142857137,\n", - " 1506.1275,\n", - " 1081.8533333333314,\n", - " 1725.5830000000003,\n", - " 1936.83,\n", - " 1834.4686689419796,\n", - " 1900.4391666666663,\n", - " 1805.5211627906972,\n", - " 1844.352278481013,\n", - " 1884.5990322580644,\n", - " 5144.063333333338,\n", - " 1844.468,\n", - " 2021.0,\n", - " 2.0,\n", - " 24.0,\n", - " 2.0,\n", - " 13.0,\n", - " 10.0,\n", - " 44990.0]]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "samples = test_data.drop('transaction_category',axis=1)[:1].values.tolist()\n", - "samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f594383-8769-40c4-834d-4452ab5f58d0", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "response = server.test(body=samp_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5ba986f-cc05-47a0-ae85-5461258e86d6", - "metadata": 
{}, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "# response = server.test(body=samples)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85c8f63c-98b9-4977-a699-dc189aba8a51", - "metadata": {}, - "outputs": [], - "source": [ - "print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2d00c6d6-869d-45b0-a6f6-e516944299fb", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d87d1c46-a0fe-4d65-b069-d641088158bf", - "metadata": {}, - "outputs": [], - "source": [ - "unique_categories = [ 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "989e0376-ac28-4859-af7d-986bcf9f4b1d", - "metadata": {}, - "outputs": [], - "source": [ - "len('unique_categories')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95bb69b4-319c-4561-937d-5ef2bf100b8d", - "metadata": {}, - "outputs": [], - "source": [ - "# Define the list of features we will be using\n", - "features = ['aggregations.*']\n", - "\n", - "# Import MLRun's Feature Store\n", - "import mlrun.feature_store as fstore\n", - "\n", - "# Define the feature vector name for future reference\n", - "fv_name = 'aggregations-vector'\n", - "\n", - "# Define the feature vector using our Feature Store (fstore)\n", - "aggregations_fv = fstore.FeatureVector(fv_name, \n", - " features, \n", - " description='aggregation information')\n", - "\n", - "# Save the feature vector in the Feature Store\n", - "aggregations_fv.save()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "88a45f5e-942c-467e-ba83-722f214ddead", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun.feature_store as fstore\n", - "\n", - "# Create a feature vector that gets the average amount\n", - "vector = fstore.FeatureVector(\"transactions_vector\", [\"aggregations.amount_avg_1d\"], with_indexes=True)\n", - "unique_categories = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"10\",\"11\",\"12\",\"13\",\"14\",\"15\",\"16\"]\n", - "# Use online feature service to get the latest average amount per category\n", - "with vector.get_online_feature_service() as online_feature_service:\n", - " resp = online_feature_service.get(\n", - " [{\"transaction_category\":cat} for cat in unique_categories]\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "351feb39-9b3b-4dbe-9662-93c690a563fa", - "metadata": {}, - "outputs": [], - "source": [ - "print(resp)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ca44f54-4f9b-45e5-a18b-43697c7ea6cd", - "metadata": {}, - "outputs": [], - "source": [ - "for cat in resp:\n", - " transaction_category = cat['transaction_category']\n", - " amount_avg = cat['amount_avg_1d']\n", - " data[\"dist_\" + transaction_category] = abs(amount_avg - data[\"amount\"])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "smdemo", - "language": "python", - "name": "smdemo" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/functions/serving.py b/src/functions/serving.py index 214239c..a04e126 100644 --- a/src/functions/serving.py +++ b/src/functions/serving.py @@ -31,6 +31,11 @@ def predict(self, body: dict) -> List: """Generate model predictions from sample.""" print(body) + # 
body['inputs'][0] = body['inputs'][0][1:] + + # print(body) + + # Convert input to numpy array: data = np.asarray(body["inputs"]) From 968995f12cdebdc53a9abf69a639bf4a9ae86312 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 12 Feb 2024 13:58:06 +0000 Subject: [PATCH 14/16] updating pipeline run --- financial-payment-pipeline.ipynb | 86 ++++++++++++++++---------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/financial-payment-pipeline.ipynb b/financial-payment-pipeline.ipynb index f9c0968..b159573 100644 --- a/financial-payment-pipeline.ipynb +++ b/financial-payment-pipeline.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "c447c260-b243-4f62-8a48-9dd07091282d", "metadata": { "editable": true, @@ -36,7 +36,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "> 2024-01-30 08:51:04,839 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" + "> 2024-02-12 13:44:28,481 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" ] } ], @@ -70,7 +70,7 @@ { "data": { "text/html": [ - "
Pipeline running (id=c5935692-c4e0-4bb9-92a7-b1d60e74bedd), click here to view the details in MLRun UI
" + "
Pipeline running (id=eb48cc6e-d6ae-4a2d-947f-600a6e4cd469), click here to view the details in MLRun UI
" ], "text/plain": [ "" @@ -85,52 +85,52 @@ "\n", "\n", - "\n", "\n", - "\n", + "\n", "\n", "kfp\n", - "\n", - "\n", + "\n", + "\n", "\n", - "fraud-detection-pipeline-6lj7n-2077403528\n", - "\n", - "\n", - "\n", - "\n", - "deploy-serving\n", + "fraud-detection-pipeline-cpzwr-2562321235\n", + "\n", + "train\n", "\n", - "\n", + "\n", "\n", - "fraud-detection-pipeline-6lj7n-3541775745\n", - "\n", - "evaluate\n", + "fraud-detection-pipeline-cpzwr-650189257\n", + "\n", + "\n", + "\n", + "\n", + "deploy-serving\n", "\n", - "\n", - "\n", - "fraud-detection-pipeline-6lj7n-4078108240\n", - "\n", - "train\n", - "\n", - "\n", + "\n", "\n", - "fraud-detection-pipeline-6lj7n-4078108240->fraud-detection-pipeline-6lj7n-2077403528\n", - "\n", - "\n", + "fraud-detection-pipeline-cpzwr-2562321235->fraud-detection-pipeline-cpzwr-650189257\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "fraud-detection-pipeline-cpzwr-2902630996\n", + "\n", + "evaluate\n", "\n", - "\n", + "\n", "\n", - "fraud-detection-pipeline-6lj7n-4078108240->fraud-detection-pipeline-6lj7n-3541775745\n", - "\n", - "\n", + "fraud-detection-pipeline-cpzwr-2562321235->fraud-detection-pipeline-cpzwr-2902630996\n", + "\n", + "\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -139,7 +139,7 @@ { "data": { "text/html": [ - "

Run Results

[info] Workflow c5935692-c4e0-4bb9-92a7-b1d60e74bedd finished, state=Succeeded


click the hyper links below to see detailed results
\n", + "

Run Results

[info] Workflow eb48cc6e-d6ae-4a2d-947f-600a6e4cd469 finished, state=Succeeded


click the hyper links below to see detailed results
\n", " \n", " \n", " \n", @@ -152,16 +152,16 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -180,7 +180,7 @@ { "data": { "text/plain": [ - "c5935692-c4e0-4bb9-92a7-b1d60e74bedd" + "eb48cc6e-d6ae-4a2d-947f-600a6e4cd469" ] }, "execution_count": 4, @@ -287,9 +287,9 @@ ], "metadata": { "kernelspec": { - "display_name": "mlrun-base", + "display_name": "smdemo", "language": "python", - "name": "conda-env-mlrun-base-py" + "name": "smdemo" }, "language_info": { "codemirror_mode": { @@ -301,7 +301,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.18" } }, "nbformat": 4, From 17437e5301ec03601e527dde5f478a3704c418e8 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 12 Feb 2024 14:01:29 +0000 Subject: [PATCH 15/16] deleting old versions --- financial_payment_classification_v2.ipynb | 2293 ------------- ...ncial_payment_classification_with_fs.ipynb | 2911 ----------------- ..._payment_classification_with_serving.ipynb | 2137 ------------ 3 files changed, 7341 deletions(-) delete mode 100644 financial_payment_classification_v2.ipynb delete mode 100644 financial_payment_classification_with_fs.ipynb delete mode 100644 financial_payment_classification_with_serving.ipynb diff --git a/financial_payment_classification_v2.ipynb b/financial_payment_classification_v2.ipynb deleted file mode 100644 index cb49af7..0000000 --- a/financial_payment_classification_v2.ipynb +++ /dev/null @@ -1,2293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "01b5c703", - "metadata": {}, - "source": [ - "# SageMaker Payment Classification \n" - ] - }, - { - "cell_type": "markdown", - "id": "6498f087", - "metadata": {}, - "source": [ - "---\n", - "\n", - "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. 
\n", - "\n", - "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "id": "c2e49281", - "metadata": {}, - "source": [ - "\n", - "## Background \n", - "\n", - "This notebook demonstrates how you can train and deploy a machine learning model to classify payment transactions. Enriching financial transactions with the category of the transaction. This can be used as an intermediate step in fraud detection, personalization or anomaly detection. As well as a method to provide end users (e.g. customers at a bank) with an overview of their spending habits. Amazon SageMaker can be used to train and deploy a XGBoost model, as well as the required underlying infrastructure. For this notebook a generated dataset is used where a payment consists of mostly an amount, sender, receiver and timestamp.\n", - "\n", - "\n", - "## Notebook overview \n", - "\n", - "This notebook consists of seven parts. First, we import and configure the required libraries. After that we prepare the data used in this example and create the feature store. With the newly created features we create a XGBoost model. An endpoint is created to host this model. We evaluate the performance of the model and end by cleaning up the used resources.\n", - "\n", - "## Dataset \n", - "\n", - "For this notebook we use a synthetic dataset. 
This dataset has the following features \n", - "\n", - "* __transaction_category__: The category of the transaction, this is one of the next 19 options.\n", - "\n", - " 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances'\n", - "\n", - "\n", - "* __receiver_id__: an identifier for the receiving party. The identifier consist of 16 numbers.\n", - "* __sender_id__: an identifier for the sending party. The identifier consist of 16 numbers.\n", - "* __amount__: the amount which is transferred.\n", - "* __timestamp__: the timestamp of the transaction in YYYY-MM-DD HH:MM:SS format.\n", - "\n", - "\n", - "### 1. Setup \n", - "\n", - "Before we start we need to update the sagemaker library" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fff19d6b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", - "# !{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", - "# !{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": 
[ - "> 2024-02-06 12:51:51,361 [info] Project loaded successfully: {'project_name': 'sagemaker-v2'}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v2\", \n", - " user_project=True,\n", - " parameters={\n", - " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", - " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1b17a94d", - "metadata": {}, - "source": [ - "Now that we have the latest version we can import the libraries that we'll use in this notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "42c5d6d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", - "sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml\n" - ] - } - ], - "source": [ - "import boto3\n", - "import io\n", - "import sagemaker\n", - "import time\n", - "import os\n", - "from sklearn.metrics import classification_report\n", - "import pandas as pd\n", - "import numpy as np\n", - "from datetime import datetime, timedelta" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", - "metadata": {}, - "outputs": [], - "source": [ - "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", - "metadata": {}, - "outputs": [], - "source": [ - "sess = sagemaker.Session()\n", - "write_bucket = sess.default_bucket()\n", - "write_prefix = \"sagemaker-app-lab\"" - ] - }, - { - "cell_type": "markdown", - "id": "3af7c33d", - "metadata": {}, - "source": [ - "Let's set the session variables to ensure that SageMaker is configured correctly." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c0e4db17", - "metadata": {}, - "outputs": [], - "source": [ - "region = sagemaker.Session().boto_region_name\n", - "sm_client = boto3.client(\"sagemaker\")\n", - "boto_session = boto3.Session(region_name=region)\n", - "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", - "role = sagemaker_role\n", - "bucket_prefix = \"payment-classification\"\n", - "s3_bucket = sagemaker_session.default_bucket()" - ] - }, - { - "cell_type": "markdown", - "id": "4fe6a975", - "metadata": {}, - "source": [ - "We define the factorize key which is used to map the '__transaction_category__' to numeric values" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "43946b9f", - "metadata": {}, - "outputs": [], - "source": [ - "factorize_key = {\n", - " \"Uncategorized\": 0,\n", - " \"Entertainment\": 1,\n", - " \"Education\": 2,\n", - " \"Shopping\": 3,\n", - " \"Personal Care\": 4,\n", - " \"Health and Fitness\": 5,\n", - " \"Food and Dining\": 6,\n", - " \"Gifts and Donations\": 7,\n", - " \"Investments\": 8,\n", - " \"Bills and Utilities\": 9,\n", - " \"Auto and Transport\": 10,\n", - " \"Travel\": 11,\n", - " \"Fees and Charges\": 12,\n", - " \"Business Services\": 13,\n", - " \"Personal Services\": 14,\n", - " \"Taxes\": 15,\n", - " \"Gambling\": 16,\n", - " \"Home\": 17,\n", - " \"Pension and insurances\": 18,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "5e3dc3c4", - "metadata": {}, - "source": [ - "### 2. 
Data preparation \n", - "\n", - "We ingest the simulated data from the public SageMaker S3 training database:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "5ff0d280", - "metadata": {}, - "outputs": [], - "source": [ - "s3 = boto3.client(\"s3\")\n", - "s3.download_file(\n", - " f\"sagemaker-example-files-prod-{region}\",\n", - " \"datasets/tabular/synthetic_financial/financial_transactions_mini.csv\",\n", - " \"financial_transactions_mini.csv\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "08578d93", - "metadata": {}, - "source": [ - "Let's start by loading the dataset from our csv file into a Pandas dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a477abd7", - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv(\n", - " \"financial_transactions_mini.csv\",\n", - " parse_dates=[\"timestamp\"],\n", - " infer_datetime_format=True,\n", - " dtype={\"transaction_category\": \"string\"},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cf6be447", - "metadata": {}, - "source": [ - "The dataframe looks as follows:\n", - "\n", - "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", - "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", - "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", - "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", - "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", - "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", - "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", - "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", - "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 
13:23:44 |\n", - "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", - "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", - "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" - ] - }, - { - "cell_type": "markdown", - "id": "b5492919", - "metadata": {}, - "source": [ - "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", - "metadata": {}, - "outputs": [], - "source": [ - "for key, val in factorize_key.items():\n", - " factorize_key[key] = str(val)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "ea2ebdd5", - "metadata": {}, - "outputs": [], - "source": [ - "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "fac2990c-fb9c-4d39-b02d-9477f55e4fcd", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using 
.loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - 
"\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "
uid
Jan 30 08:58:05Feb 12 13:51:31completedevaluate
model_path=store://artifacts/sagemaker-yoni/train_model_path@c5935692-c4e0-4bb9-92a7-b1d60e74bedd
model_name=xgboost-model
label_column=transaction_category
model_path=store://artifacts/sagemaker-admin/train_model_path@eb48cc6e-d6ae-4a2d-947f-600a6e4cd469
model_name=xgboost-model
label_column=transaction_category
Jan 30 08:51:15Feb 12 13:44:38completedtrain
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamounttimestamp
10604.601853e+154.274416e+15879.392021-01-01 15:07:52
37804.274544e+154.366884e+15628.012021-01-01 16:33:53
36804.601853e+154.161674e+1589.692021-01-01 18:17:29
1704.518552e+154.619387e+15222.012021-01-01 18:33:18
17804.274544e+154.456440e+15418.522021-01-01 19:33:31
..................
6993894.904096e+154.133603e+15124.082024-02-04 15:00:00
7059294.904096e+154.444087e+15188.662024-02-05 10:00:00
7037994.200241e+154.202495e+15139.272024-02-05 15:00:00
7046294.612985e+154.525455e+1512.492024-02-06 10:00:00
7167294.538817e+154.291294e+1557.032024-02-06 15:00:00
\n", - "

99997 rows × 5 columns

\n", - "" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount \\\n", - "106 0 4.601853e+15 4.274416e+15 879.39 \n", - "378 0 4.274544e+15 4.366884e+15 628.01 \n", - "368 0 4.601853e+15 4.161674e+15 89.69 \n", - "17 0 4.518552e+15 4.619387e+15 222.01 \n", - "178 0 4.274544e+15 4.456440e+15 418.52 \n", - "... ... ... ... ... \n", - "69938 9 4.904096e+15 4.133603e+15 124.08 \n", - "70592 9 4.904096e+15 4.444087e+15 188.66 \n", - "70379 9 4.200241e+15 4.202495e+15 139.27 \n", - "70462 9 4.612985e+15 4.525455e+15 12.49 \n", - "71672 9 4.538817e+15 4.291294e+15 57.03 \n", - "\n", - " timestamp \n", - "106 2021-01-01 15:07:52 \n", - "378 2021-01-01 16:33:53 \n", - "368 2021-01-01 18:17:29 \n", - "17 2021-01-01 18:33:18 \n", - "178 2021-01-01 19:33:31 \n", - "... ... \n", - "69938 2024-02-04 15:00:00 \n", - "70592 2024-02-05 10:00:00 \n", - "70379 2024-02-05 15:00:00 \n", - "70462 2024-02-06 10:00:00 \n", - "71672 2024-02-06 15:00:00 \n", - "\n", - "[99997 rows x 5 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 days (2 per day)\n", - "from utils import update_timestamps\n", - "data = update_timestamps(data)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", - "metadata": { - "tags": [] - }, - "source": [ - "### 3. Create feature store \n", - "\n", - "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 
" - ] - }, - { - "cell_type": "markdown", - "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", - "metadata": {}, - "source": [ - "#### feature-group-payment-classification" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "07fdb07a-f3b7-4255-b38b-17a939b8676d", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "_start->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fstore\n", - "\n", - "# Create a feature set with the moving average (daily window\n", - "fset = fstore.FeatureSet(\n", - " \"aggregations\",\n", - " entities=[mlrun.features.Entity(\"transaction_category\")],\n", - " timestamp_key=\"timestamp\"\n", - ")\n", - "fset.add_aggregation(\"amount\", [\"avg\"], \"1d\")\n", - "fset.set_targets()\n", - "fset.graph.plot()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "e213e91e-276a-4cde-a2fc-059369cc837a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
amount_avg_1dreceiver_idsender_idamounttimestamp
transaction_category
0879.3900004.601853e+154.274416e+15879.392021-01-01 15:07:52
0753.7000004.274544e+154.366884e+15628.012021-01-01 16:33:53
0532.3633334.601853e+154.161674e+1589.692021-01-01 18:17:29
0454.7750004.518552e+154.619387e+15222.012021-01-01 18:33:18
0447.5240004.274544e+154.456440e+15418.522021-01-01 19:33:31
..................
9126.3550004.904096e+154.133603e+15124.082024-02-04 15:00:00
9188.6600004.904096e+154.444087e+15188.662024-02-05 10:00:00
9163.9650004.200241e+154.202495e+15139.272024-02-05 15:00:00
912.4900004.612985e+154.525455e+1512.492024-02-06 10:00:00
934.7600004.538817e+154.291294e+1557.032024-02-06 15:00:00
\n", - "

99997 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " amount_avg_1d receiver_id sender_id amount \\\n", - "transaction_category \n", - "0 879.390000 4.601853e+15 4.274416e+15 879.39 \n", - "0 753.700000 4.274544e+15 4.366884e+15 628.01 \n", - "0 532.363333 4.601853e+15 4.161674e+15 89.69 \n", - "0 454.775000 4.518552e+15 4.619387e+15 222.01 \n", - "0 447.524000 4.274544e+15 4.456440e+15 418.52 \n", - "... ... ... ... ... \n", - "9 126.355000 4.904096e+15 4.133603e+15 124.08 \n", - "9 188.660000 4.904096e+15 4.444087e+15 188.66 \n", - "9 163.965000 4.200241e+15 4.202495e+15 139.27 \n", - "9 12.490000 4.612985e+15 4.525455e+15 12.49 \n", - "9 34.760000 4.538817e+15 4.291294e+15 57.03 \n", - "\n", - " timestamp \n", - "transaction_category \n", - "0 2021-01-01 15:07:52 \n", - "0 2021-01-01 16:33:53 \n", - "0 2021-01-01 18:17:29 \n", - "0 2021-01-01 18:33:18 \n", - "0 2021-01-01 19:33:31 \n", - "... ... \n", - "9 2024-02-04 15:00:00 \n", - "9 2024-02-05 10:00:00 \n", - "9 2024-02-05 15:00:00 \n", - "9 2024-02-06 10:00:00 \n", - "9 2024-02-06 15:00:00 \n", - "\n", - "[99997 rows x 5 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_with_cat_avg = fset.ingest(data, return_df=True)\n", - "df_with_cat_avg" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "50441ed4-a228-44e7-87ce-024177b928f6", - "metadata": {}, - "outputs": [], - "source": [ - "# Import MLRun's Feature Store\n", - "import mlrun.feature_store as fstore\n", - "\n", - "# create feature vector on top of aggreagations\n", - "# Define the list of features we will be using\n", - "features = ['aggregations.*']\n", - "\n", - "# Define the feature vector name for future reference\n", - "fv_name = 'aggreagations-vector'\n", - "\n", - "# Define the feature vector using our Feature Store (fstore)\n", - "aggregations_fv = fstore.FeatureVector(fv_name, \n", - " features, \n", - " description='stocks information')\n", - "\n", - "# Save the feature 
vector in the Feature Store\n", - "aggregations_fv.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "19081c06-240e-481b-bfe3-588bb77bd54e", - "metadata": {}, - "outputs": [], - "source": [ - "# Function that gets a dataframe, creates daily dates and group by the date and the given column.\n", - "# It returns a dataframe where the index is daily dates, columns are the categories, and each value is the last average for that category for a given day\n", - "def get_last_transaction_avg_per_day(df, column_to_groupby):\n", - " df['date'] = pd.to_datetime(df['timestamp']).dt.date\n", - " df = df.groupby(['date', df.index])[column_to_groupby].last()\n", - " df = df.unstack(fill_value=0)\n", - " return df\n", - "\n", - "\n", - "# Function that gets a dataframe and calculates a moving average per category and the distance between the row's amount to each category average\n", - "def add_grouped_features(data):\n", - "\n", - " df_with_cat_avg = data\n", - " df_with_cat_avg.sort_values([\"transaction_category\", \"timestamp\"], inplace=True)\n", - " \n", - " # Convert the timestamp to daily date and remove the aggregated average\n", - " df_with_cat_avg['date'] = pd.to_datetime(df_with_cat_avg['timestamp']).dt.date\n", - " df_without_cat_avg = df_with_cat_avg.drop(\"amount_avg_1d\", axis=1)\n", - " \n", - " # Get the daily average per transaction category\n", - " df_with_all_cat_avg = get_last_transaction_avg_per_day(df_with_cat_avg, 'amount_avg_1d')\n", - " \n", - " # Now let's join the 2 dataframes + calculate distance from average\n", - " unique_categories = df_without_cat_avg.index.unique()\n", - " df_without_cat_avg = df_without_cat_avg.reset_index()\n", - " \n", - " # Join the 2 dataframes\n", - " df_merged = pd.merge(df_without_cat_avg, df_with_all_cat_avg, on='date', how='outer')\n", - "\n", - " # For each transaction_category, calculate the distance and remove the category column\n", - " for col in unique_categories:\n", - " 
df_merged[\"dist_\" + col] = abs(df_merged[col] - df_merged[\"amount\"])\n", - " df_merged.drop(col, axis=1, inplace=True)\n", - " \n", - " # Split the timestamp into components\n", - " df_merged[\"year\"] = df_merged[\"timestamp\"].dt.year\n", - " df_merged[\"month\"] = df_merged[\"timestamp\"].dt.month\n", - " df_merged[\"day\"] = df_merged[\"timestamp\"].dt.day\n", - " df_merged[\"hour\"] = df_merged[\"timestamp\"].dt.hour\n", - " df_merged[\"minute\"] = df_merged[\"timestamp\"].dt.minute\n", - " df_merged[\"second\"] = df_merged[\"timestamp\"].dt.second\n", - "\n", - " del df_merged[\"timestamp\"]\n", - " del df_merged[\"date\"] \n", - " df_merged['transaction_id']= df_merged.reset_index().index \n", - " \n", - " return df_merged" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "8993721e-f0e5-4438-ab55-2f9bfb78e20a", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "add_grouped_features\n", - "\n", - "add_grouped_features\n", - "\n", - "\n", - "\n", - "_start->add_grouped_features\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet/parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "add_grouped_features->parquet/parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql/nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "add_grouped_features->nosql/nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fstore\n", - "\n", - "# creating feature set\n", - "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", - " entities=[fstore.Entity(\"transaction_id\")],\n", - " engine=\"pandas\",\n", - " description=\"transactions feature set\")\n", - "\n", - "# setting up 
the graph\n", - "# setting up the graph\n", - "extended_transactions_set.graph \\\n", - " .to(name=\"add_grouped_features\", handler=\"add_grouped_features\")\n", - "\n", - "\n", - "extended_transactions_set.set_targets()\n", - "\n", - "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "2085e0a9-56e1-4641-a4a6-64e2124d9c15", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-06 13:03:26,132 [warning] Overriding type of entity 'transaction_id' from 'str' to 'int'. This may result in errors or unusable data.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamountdist_0dist_1dist_10dist_11dist_12dist_13...dist_7dist_8dist_9yearmonthdayhourminutesecondtransaction_id
004.601853e+154.274416e+15879.39490.66830.437308776.697953557.02625849.33375680.511538...842.6945456064.542857732.001765202111157520
104.274544e+154.366884e+15628.01239.28579.057308525.317953305.64625597.95375429.131538...591.3145456315.922857480.6217652021111633531
204.601853e+154.161674e+1589.69299.0440.73730813.002047232.6737559.63375109.188462...52.9945456854.24285757.6982352021111817292
304.518552e+154.619387e+15222.01166.72173.057308119.317953100.35375191.9537523.131538...185.3145456721.92285774.6217652021111833183
404.274544e+154.456440e+15418.5229.79369.567308315.82795396.15625388.46375219.641538...381.8245456525.412857271.1317652021111933314
..................................................................
9999294.735688e+154.925043e+15188.77188.77140.16494892.698188153.58500188.770002.936667...145.000476188.77000051.24458320214291662799992
9999394.419127e+154.035793e+15130.45130.4581.84494834.378188211.90500130.4500061.256667...86.680476130.4500007.075417202142916533199993
9999494.885580e+154.892613e+15249.82249.82201.214948153.74818892.53500249.8200058.113333...206.050476249.820000112.294583202142916575799994
9999594.538817e+154.853749e+15130.17130.1781.56494834.098188212.18500130.1700061.536667...86.400476130.1700007.3554172021429174499995
9999694.871261e+154.625081e+15135.34135.3486.73494839.268188207.01500135.3400056.366667...91.570476135.3400002.185417202142917102199996
\n", - "

99997 rows × 30 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount dist_0 \\\n", - "0 0 4.601853e+15 4.274416e+15 879.39 490.66 \n", - "1 0 4.274544e+15 4.366884e+15 628.01 239.28 \n", - "2 0 4.601853e+15 4.161674e+15 89.69 299.04 \n", - "3 0 4.518552e+15 4.619387e+15 222.01 166.72 \n", - "4 0 4.274544e+15 4.456440e+15 418.52 29.79 \n", - "... ... ... ... ... ... \n", - "99992 9 4.735688e+15 4.925043e+15 188.77 188.77 \n", - "99993 9 4.419127e+15 4.035793e+15 130.45 130.45 \n", - "99994 9 4.885580e+15 4.892613e+15 249.82 249.82 \n", - "99995 9 4.538817e+15 4.853749e+15 130.17 130.17 \n", - "99996 9 4.871261e+15 4.625081e+15 135.34 135.34 \n", - "\n", - " dist_1 dist_10 dist_11 dist_12 dist_13 ... \\\n", - "0 830.437308 776.697953 557.02625 849.33375 680.511538 ... \n", - "1 579.057308 525.317953 305.64625 597.95375 429.131538 ... \n", - "2 40.737308 13.002047 232.67375 59.63375 109.188462 ... \n", - "3 173.057308 119.317953 100.35375 191.95375 23.131538 ... \n", - "4 369.567308 315.827953 96.15625 388.46375 219.641538 ... \n", - "... ... ... ... ... ... ... \n", - "99992 140.164948 92.698188 153.58500 188.77000 2.936667 ... \n", - "99993 81.844948 34.378188 211.90500 130.45000 61.256667 ... \n", - "99994 201.214948 153.748188 92.53500 249.82000 58.113333 ... \n", - "99995 81.564948 34.098188 212.18500 130.17000 61.536667 ... \n", - "99996 86.734948 39.268188 207.01500 135.34000 56.366667 ... \n", - "\n", - " dist_7 dist_8 dist_9 year month day hour minute \\\n", - "0 842.694545 6064.542857 732.001765 2021 1 1 15 7 \n", - "1 591.314545 6315.922857 480.621765 2021 1 1 16 33 \n", - "2 52.994545 6854.242857 57.698235 2021 1 1 18 17 \n", - "3 185.314545 6721.922857 74.621765 2021 1 1 18 33 \n", - "4 381.824545 6525.412857 271.131765 2021 1 1 19 33 \n", - "... ... ... ... ... ... ... ... ... 
\n", - "99992 145.000476 188.770000 51.244583 2021 4 29 16 6 \n", - "99993 86.680476 130.450000 7.075417 2021 4 29 16 53 \n", - "99994 206.050476 249.820000 112.294583 2021 4 29 16 57 \n", - "99995 86.400476 130.170000 7.355417 2021 4 29 17 4 \n", - "99996 91.570476 135.340000 2.185417 2021 4 29 17 10 \n", - "\n", - " second transaction_id \n", - "0 52 0 \n", - "1 53 1 \n", - "2 29 2 \n", - "3 18 3 \n", - "4 31 4 \n", - "... ... ... \n", - "99992 27 99992 \n", - "99993 31 99993 \n", - "99994 57 99994 \n", - "99995 4 99995 \n", - "99996 21 99996 \n", - "\n", - "[99997 rows x 30 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fstore\n", - "data = extended_transactions_set.ingest(df_with_cat_avg, overwrite=True)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "id": "b5e4834e", - "metadata": {}, - "source": [ - "We update the values in the feature store with the real values of our data" - ] - }, - { - "cell_type": "markdown", - "id": "e2f6395f", - "metadata": {}, - "source": [ - "And display them after getting them from the feature store" - ] - }, - { - "cell_type": "markdown", - "id": "cf148985", - "metadata": {}, - "source": [ - "We use the feature store to calculate the distance between the average of every category and the current amount" - ] - }, - { - "cell_type": "markdown", - "id": "289eeca6", - "metadata": {}, - "source": [ - "### 4. Create model \n", - "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. 
The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", - "\n", - "\n", - "\n", - "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "bb4bdd8d", - "metadata": {}, - "outputs": [], - "source": [ - "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", - "train_data, validation_data, test_data = np.split(\n", - " data.sample(frac=1, random_state=42), [int(0.7 * len(data)), int(0.9 * len(data))]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f81f65b9", - "metadata": {}, - "source": [ - "We save these sets to a file." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "f849a7a9", - "metadata": {}, - "outputs": [], - "source": [ - "train_data.to_csv(\"train.csv\", index=False, header=False)\n", - "validation_data.to_csv(\"validation.csv\", index=False, header=False)\n", - "test_data.to_csv(\"test.csv\", index=False, header=True)" - ] - }, - { - "cell_type": "markdown", - "id": "de669936", - "metadata": {}, - "source": [ - "And upload these files to our s3 bucket" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "e1ca2543", - "metadata": {}, - "outputs": [], - "source": [ - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"train/train.csv\")\n", - ").upload_file(\"train.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"validation/validation.csv\")\n", - ").upload_file(\"validation.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"test/test.csv\")\n", - 
").upload_file(\"test.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "22de532f", - "metadata": {}, - "source": [ - "Get the XGBoost sagemaker image" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "a41b6a7d", - "metadata": {}, - "outputs": [], - "source": [ - "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" - ] - }, - { - "cell_type": "markdown", - "id": "66cae2a9", - "metadata": {}, - "source": [ - "Transform our data to a sagemaker input for training" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "e51c917a", - "metadata": {}, - "outputs": [], - "source": [ - "s3_input_train = sagemaker.inputs.TrainingInput(\n", - " s3_data=\"s3://{}/{}/train\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")\n", - "s3_input_validation = sagemaker.inputs.TrainingInput(\n", - " s3_data=\"s3://{}/{}/validation/\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6f2985d8", - "metadata": {}, - "source": [ - "We define the XGBoost model" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "92c1fe8c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb = sagemaker.estimator.Estimator(\n", - " container,\n", - " role,\n", - " instance_count=1,\n", - " instance_type=\"ml.m4.xlarge\",\n", - " output_path=\"s3://{}/{}/output\".format(s3_bucket, bucket_prefix),\n", - " sagemaker_session=sagemaker_session,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ecafdfe8", - "metadata": {}, - "source": [ - "Set the parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "582adc6c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb.set_hyperparameters(\n", - " max_depth=5,\n", - " eta=0.2,\n", - " gamma=4,\n", - " min_child_weight=6,\n", - " subsample=0.8,\n", - " objective=\"multi:softprob\",\n", - " num_class=19,\n", - " verbosity=0,\n", - " 
num_round=100,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b36463dd", - "metadata": {}, - "source": [ - "And train the model" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "c24e06fc", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-06-13-03-44-059\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-02-06 13:03:44 Starting - Starting the training job...\n", - "2024-02-06 13:04:08 Starting - Preparing the instances for training.........\n", - "2024-02-06 13:05:26 Downloading - Downloading input data...\n", - "2024-02-06 13:05:56 Downloading - Downloading the training image......\n", - "2024-02-06 13:06:51 Training - Training image download completed. Training in progress..\u001b[34m[2024-02-06 13:07:07.762 ip-10-0-153-109.us-east-2.compute.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:07:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:07:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:07:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:07:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:07:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:07:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:07:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:08:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:08:INFO] Single node training.\u001b[0m\n", - 
"\u001b[34m[2024-02-06:13:07:08:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:08:INFO] Validation matrix has 20000 rows\u001b[0m\n", - "\u001b[34m[2024-02-06 13:07:08.056 ip-10-0-153-109.us-east-2.compute.internal:8 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-06 13:07:08.057 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-06 13:07:08.057 ip-10-0-153-109.us-east-2.compute.internal:8 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-06 13:07:08.058 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-06 13:07:08.058 ip-10-0-153-109.us-east-2.compute.internal:8 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-06:13:07:08:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.54515#011validation-merror:0.55430\u001b[0m\n", - "\u001b[34m[2024-02-06 13:07:10.007 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-06 13:07:10.012 ip-10-0-153-109.us-east-2.compute.internal:8 INFO hook.py:486] Hook is writing from the hook with pid: 8\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.53387#011validation-merror:0.54255\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.52198#011validation-merror:0.53050\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.51036#011validation-merror:0.52010\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.49936#011validation-merror:0.51095\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.49232#011validation-merror:0.50425\u001b[0m\n", - 
"\u001b[34m[6]#011train-merror:0.48936#011validation-merror:0.50210\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.48521#011validation-merror:0.49810\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.48034#011validation-merror:0.49275\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.47621#011validation-merror:0.48995\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.47151#011validation-merror:0.48500\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.46211#011validation-merror:0.47540\u001b[0m\n", - "\u001b[34m[12]#011train-merror:0.45770#011validation-merror:0.47160\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.45441#011validation-merror:0.46720\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.45021#011validation-merror:0.46235\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.44288#011validation-merror:0.45495\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.43809#011validation-merror:0.45070\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.43083#011validation-merror:0.44490\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.42683#011validation-merror:0.44065\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.41773#011validation-merror:0.43280\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.41412#011validation-merror:0.42900\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.40940#011validation-merror:0.42570\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.40558#011validation-merror:0.42220\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.40010#011validation-merror:0.41570\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.39509#011validation-merror:0.41130\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.39215#011validation-merror:0.40905\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.38077#011validation-merror:0.39840\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.37355#011validation-merror:0.39080\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.36949#011validation-merror:0.38705\u001b[0m\n", - 
"\u001b[34m[29]#011train-merror:0.36450#011validation-merror:0.38150\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.35094#011validation-merror:0.36650\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.34519#011validation-merror:0.35935\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.34140#011validation-merror:0.35690\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.33711#011validation-merror:0.35250\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.33434#011validation-merror:0.34945\u001b[0m\n", - "\u001b[34m[35]#011train-merror:0.32674#011validation-merror:0.34200\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.32153#011validation-merror:0.33760\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.31661#011validation-merror:0.33145\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.31099#011validation-merror:0.32515\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.30624#011validation-merror:0.31980\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.29983#011validation-merror:0.31405\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.29713#011validation-merror:0.31095\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.29291#011validation-merror:0.30635\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.28383#011validation-merror:0.29810\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.27650#011validation-merror:0.29000\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.26714#011validation-merror:0.27965\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.26255#011validation-merror:0.27560\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.25953#011validation-merror:0.27225\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.25214#011validation-merror:0.26700\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.24840#011validation-merror:0.26255\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.24452#011validation-merror:0.25870\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.24284#011validation-merror:0.25745\u001b[0m\n", - 
"\u001b[34m[52]#011train-merror:0.23650#011validation-merror:0.25165\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.23110#011validation-merror:0.24585\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.22725#011validation-merror:0.24250\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.22378#011validation-merror:0.23920\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.22085#011validation-merror:0.23610\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.21867#011validation-merror:0.23400\u001b[0m\n", - "\u001b[34m[58]#011train-merror:0.21477#011validation-merror:0.23010\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.21047#011validation-merror:0.22455\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.20769#011validation-merror:0.22170\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.20254#011validation-merror:0.21670\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.19889#011validation-merror:0.21330\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.19612#011validation-merror:0.21035\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.19432#011validation-merror:0.20815\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.18804#011validation-merror:0.20185\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.18431#011validation-merror:0.19805\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.18051#011validation-merror:0.19390\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.17619#011validation-merror:0.19000\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.17419#011validation-merror:0.18745\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.17288#011validation-merror:0.18590\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.17129#011validation-merror:0.18445\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.16491#011validation-merror:0.17750\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.16085#011validation-merror:0.17335\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.15854#011validation-merror:0.17080\u001b[0m\n", - 
"\u001b[34m[75]#011train-merror:0.15472#011validation-merror:0.16830\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.15298#011validation-merror:0.16705\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.15158#011validation-merror:0.16590\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.14944#011validation-merror:0.16430\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.14696#011validation-merror:0.16185\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.14485#011validation-merror:0.15990\u001b[0m\n", - "\u001b[34m[81]#011train-merror:0.14281#011validation-merror:0.15785\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.14082#011validation-merror:0.15545\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.14005#011validation-merror:0.15490\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.13581#011validation-merror:0.15075\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.13385#011validation-merror:0.14885\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.13242#011validation-merror:0.14735\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.12923#011validation-merror:0.14400\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.12895#011validation-merror:0.14380\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.12702#011validation-merror:0.14160\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.12629#011validation-merror:0.14090\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.12568#011validation-merror:0.14010\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.12215#011validation-merror:0.13690\u001b[0m\n", - "\n", - "2024-02-06 13:10:17 Uploading - Uploading generated training model\u001b[34m[93]#011train-merror:0.11968#011validation-merror:0.13450\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.11878#011validation-merror:0.13360\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.11785#011validation-merror:0.13240\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.11631#011validation-merror:0.13090\u001b[0m\n", - 
"\u001b[34m[97]#011train-merror:0.11509#011validation-merror:0.12975\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.11213#011validation-merror:0.12605\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.11039#011validation-merror:0.12445\u001b[0m\n", - "\n", - "2024-02-06 13:10:33 Completed - Training job completed\n", - "Training seconds: 308\n", - "Billable seconds: 308\n" - ] - } - ], - "source": [ - "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" - ] - }, - { - "cell_type": "markdown", - "id": "8b716cd7", - "metadata": {}, - "source": [ - "### 5. Using the endpoint \n", - "\n", - "Deploy the model to an endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-06-13-03-44-059/output/model.tar.gz'" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "xgb.model_data" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "78444d49-4ad3-49e4-a579-19b173facb26", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function = project.get_function(\"serving\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "preprocess\n", - "\n", - "preprocess\n", - "\n", - "\n", - "\n", - "_start->preprocess\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "xgboost-model\n", - "\n", - "xgboost-model\n", - "\n", - "\n", - "\n", - "preprocess->xgboost-model\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "postprocess\n", - "\n", - "postprocess\n", - "\n", - "\n", - "\n", - 
"xgboost-model->postprocess\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Set the topology and get the graph object:\n", - "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", - "\n", - "# Add the steps:\n", - "graph.to(handler=\"preprocess\", name=\"preprocess\") \\\n", - " .to(\"XGBModelServer\",\n", - " name=\"xgboost-model\",\n", - " model_path=xgb.model_data) \\\n", - " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", - "\n", - "# Plot to graph:\n", - "serving_function.plot(rankdir='LR')" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-06 13:10:58,760 [info] Starting remote function deploy\n", - "2024-02-06 13:10:59 (info) Deploying function\n", - "2024-02-06 13:10:59 (info) Building\n", - "2024-02-06 13:10:59 (info) Staging files and preparing base images\n", - "2024-02-06 13:10:59 (info) Building processor image\n", - "2024-02-06 13:12:04 (info) Build complete\n", - "Failed to deploy. 
Details:\n", - "Traceback (most recent call last):\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", - " self.load()\n", - " File \"/opt/nuclio/serving.py\", line 21, in load\n", - " model_file, extra_data = self.get_model(\".tar.gz\")\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", - " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", - " obj.download(temp_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", - " self._store.download(self._path, target_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", - " data = self.get(key)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", - " return obj.get()[\"Body\"].read()\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", - " response = action(self, *args, **kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", - " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", - " return self._make_api_call(operation_name, kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", - " raise error_class(parsed_response, operation_name)\n", - "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", - " [worker_id=0]\n", - "Exception raised while running init_context [worker_id=0]\n", - "Traceback (most recent call last):\n", - " File 
\"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 127, in _load_and_update_state\n", - " self.load()\n", - " File \"/opt/nuclio/serving.py\", line 21, in load\n", - " model_file, extra_data = self.get_model(\".tar.gz\")\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 197, in get_model\n", - " model_file, self.model_spec, extra_dataitems = mlrun.artifacts.get_model(\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/artifacts/model.py\", line 607, in get_model\n", - " obj.download(temp_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 473, in download\n", - " self._store.download(self._path, target_path)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/base.py\", line 154, in download\n", - " data = self.get(key)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/datastore/s3.py\", line 175, in get\n", - " return obj.get()[\"Body\"].read()\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/factory.py\", line 581, in do_action\n", - " response = action(self, *args, **kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/boto3/resources/action.py\", line 88, in __call__\n", - " response = getattr(parent.meta.client, operation_name)(*args, **params)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 553, in _api_call\n", - " return self._make_api_call(operation_name, kwargs)\n", - " File \"/opt/conda/lib/python3.9/site-packages/botocore/client.py\", line 1009, in _make_api_call\n", - " raise error_class(parsed_response, operation_name)\n", - "botocore.exceptions.ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied\n", - "\n", - "The above exception was the direct cause of the following exception:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 480, in \n", - " 
run_wrapper()\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 468, in run_wrapper\n", - " loop.run_until_complete(wrapper_instance.initialize())\n", - " File \"/opt/conda/lib/python3.9/asyncio/base_events.py\", line 647, in run_until_complete\n", - " return future.result()\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 165, in initialize\n", - " await self._initialize_context()\n", - " File \"/opt/nuclio/_nuclio_wrapper.py\", line 188, in _initialize_context\n", - " init_context_result = getattr(self._entrypoint_module, 'init_context')(self._context)\n", - " File \"/opt/nuclio/serving.py\", line 135, in init_context\n", - " nuclio_init_hook(context, globals(), 'serving_v2')\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/runtimes/nuclio.py\", line 34, in nuclio_init_hook\n", - " v2_serving_init(context, data)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 349, in v2_serving_init\n", - " serving_handler = server.init_object(namespace or get_caller_globals())\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/server.py\", line 192, in init_object\n", - " self.graph.init_object(self.context, namespace, self.load_mode, reset=True)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 917, in init_object\n", - " step.init_object(context, namespace, mode, reset=reset)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 444, in init_object\n", - " self._post_init(mode)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/states.py\", line 502, in _post_init\n", - " self._object.post_init(mode)\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 143, in post_init\n", - " self._load_and_update_state()\n", - " File \"/opt/conda/lib/python3.9/site-packages/mlrun/serving/v2_serving.py\", line 131, in _load_and_update_state\n", - " raise RuntimeError(f\"failed to load model 
{self.name}\") from exc\n", - "RuntimeError: failed to load model xgboost-model\n", - "> 2024-02-06 13:12:30,976 [error] Nuclio function failed to deploy: {'function_state': 'error'}\n" - ] - }, - { - "ename": "RunError", - "evalue": "Function serving deployment failed", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[31], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mproject\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mserving\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/project.py:3188\u001b[0m, in \u001b[0;36mMlrunProject.deploy_function\u001b[0;34m(self, function, dashboard, models, env, tag, verbose, builder_env, mock)\u001b[0m\n\u001b[1;32m 3166\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeploy_function\u001b[39m(\n\u001b[1;32m 3167\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 3168\u001b[0m function: typing\u001b[38;5;241m.\u001b[39mUnion[\u001b[38;5;28mstr\u001b[39m, mlrun\u001b[38;5;241m.\u001b[39mruntimes\u001b[38;5;241m.\u001b[39mBaseRuntime],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3175\u001b[0m mock: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 3176\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m typing\u001b[38;5;241m.\u001b[39mUnion[DeployStatus, kfp\u001b[38;5;241m.\u001b[39mdsl\u001b[38;5;241m.\u001b[39mContainerOp]:\n\u001b[1;32m 3177\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"deploy real-time (nuclio based) functions\u001b[39;00m\n\u001b[1;32m 3178\u001b[0m \n\u001b[1;32m 3179\u001b[0m \u001b[38;5;124;03m :param 
function: name of the function (in the project) or function object\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3186\u001b[0m \u001b[38;5;124;03m :param mock: deploy mock server vs a real Nuclio function (for local simulations)\u001b[39;00m\n\u001b[1;32m 3187\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mdeploy_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3189\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3190\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3192\u001b[0m \u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3194\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3195\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3196\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmock\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmock\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/projects/operations.py:395\u001b[0m, 
in \u001b[0;36mdeploy_function\u001b[0;34m(function, dashboard, models, env, tag, verbose, builder_env, project_object, mock)\u001b[0m\n\u001b[1;32m 388\u001b[0m function\u001b[38;5;241m.\u001b[39msave()\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 390\u001b[0m state\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 391\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMock\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname},\n\u001b[1;32m 392\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 393\u001b[0m )\n\u001b[0;32m--> 395\u001b[0m address \u001b[38;5;241m=\u001b[39m \u001b[43mfunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# return object with the same outputs as the KFP op (allow using the same pipeline)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DeployStatus(\n\u001b[1;32m 400\u001b[0m 
state\u001b[38;5;241m=\u001b[39mfunction\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mstate,\n\u001b[1;32m 401\u001b[0m outputs\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mendpoint\u001b[39m\u001b[38;5;124m\"\u001b[39m: address, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: function\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mnuclio_name},\n\u001b[1;32m 402\u001b[0m function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m 403\u001b[0m )\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/serving.py:647\u001b[0m, in \u001b[0;36mServingRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_deploy_function_refs()\n\u001b[1;32m 645\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeploy root function \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m ...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 647\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdeploy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mdashboard\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 652\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mauth_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuilder_env\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuilder_env\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 654\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_build\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_build\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 655\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:586\u001b[0m, in \u001b[0;36mRemoteRuntime.deploy\u001b[0;34m(self, dashboard, project, tag, verbose, auth_info, builder_env, force_build)\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_credentials_from_remote_build(data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 584\u001b[0m \u001b[38;5;66;03m# when a function is deployed, we wait for it to be ready by default\u001b[39;00m\n\u001b[1;32m 585\u001b[0m \u001b[38;5;66;03m# this also means that the function object will be updated with the function status\u001b[39;00m\n\u001b[0;32m--> 586\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_function_deployment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 588\u001b[0m \u001b[38;5;66;03m# NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort\u001b[39;00m\n\u001b[1;32m 589\u001b[0m \u001b[38;5;66;03m# now, functions can be not exposed (using service type ClusterIP) and hence\u001b[39;00m\n\u001b[1;32m 590\u001b[0m \u001b[38;5;66;03m# for BC we first try to populate the external invocation url, and then\u001b[39;00m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;66;03m# if not 
exists, take the internal invocation url\u001b[39;00m\n\u001b[1;32m 592\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus\u001b[38;5;241m.\u001b[39mexternal_invocation_urls:\n", - "File \u001b[0;32m~/.conda/envs/smdemo/lib/python3.9/site-packages/mlrun/runtimes/function.py:633\u001b[0m, in \u001b[0;36mRemoteRuntime._wait_for_function_deployment\u001b[0;34m(self, db, verbose)\u001b[0m\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m state \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mready\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 632\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNuclio function failed to deploy\u001b[39m\u001b[38;5;124m\"\u001b[39m, function_state\u001b[38;5;241m=\u001b[39mstate)\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RunError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFunction \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m deployment failed\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mRunError\u001b[0m: Function serving deployment failed" - ] - } - ], - "source": [ - "project.deploy_function(\"serving\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c858e3e9-9e43-4148-8015-6047565db456", - "metadata": {}, - "outputs": [], - "source": [ - "samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", - "metadata": {}, - "outputs": [], - "source": [ - "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" - ] - }, - { - "cell_type": "markdown", - "id": "712f4d35", - "metadata": {}, - "source": [ - "### 6. 
Evaluate performance \n", - "\n", - "Run the model on our test data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e863ea7-5804-4637-b677-390c305cabfe", - "metadata": {}, - "outputs": [], - "source": [ - "s3_data = \"s3://{}/{}/test/test.csv\".format(s3_bucket, bucket_prefix)" - ] - }, - { - "cell_type": "markdown", - "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", - "metadata": {}, - "source": [ - "Add the evaluation function to our project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca4f7e49", - "metadata": {}, - "outputs": [], - "source": [ - "evaluate_function = project.get_function(\"evaluate\")" - ] - }, - { - "cell_type": "markdown", - "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", - "metadata": {}, - "source": [ - "Run the evaluation job" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", - "metadata": {}, - "outputs": [], - "source": [ - "evaluate_run = evaluate_function.run(\n", - " handler=\"evaluate\",\n", - " params={\n", - " \"model_path\": xgb.model_data,\n", - " \"model_name\": \"xgboost-model\",\n", - " \"test_set\": s3_data,\n", - " \"label_column\": \"transaction_category\",\n", - " \"factorize_key\": factorize_key,\n", - " },\n", - " returns=[\"classification_report: dataset\"])" - ] - }, - { - "cell_type": "markdown", - "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", - "metadata": {}, - "source": [ - "See the evaluation result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", - "metadata": {}, - "outputs": [], - "source": [ - "evaluate_run.artifact(\"classification_report\").as_df()" - ] - }, - { - "cell_type": "markdown", - "id": "98d0b67e", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "You should see results similar to this:\n", - "\n", - "```\n", - " precision recall f1-score support\n", - "\n", - " 
Uncategorized 1.00 0.92 0.96 51\n", - " Entertainment 0.81 0.89 0.85 1486\n", - " Education 1.00 0.94 0.97 80\n", - " Shopping 0.86 0.94 0.90 3441\n", - " Personal Care 1.00 0.98 0.99 132\n", - " Health and Fitness 0.99 0.89 0.94 443\n", - " Food and Dining 0.99 0.82 0.90 918\n", - " Gifts and Donations 1.00 0.95 0.97 275\n", - " Investments 0.99 0.97 0.98 88\n", - " Bills and Utilities 1.00 0.99 1.00 332\n", - " Auto and Transport 0.94 0.84 0.88 1967\n", - " Travel 0.96 0.84 0.90 120\n", - " Fees and Charges 1.00 0.94 0.97 106\n", - " Business Services 1.00 0.99 1.00 146\n", - " Personal Services 1.00 0.96 0.98 75\n", - " Taxes 0.98 0.94 0.96 47\n", - " Gambling 1.00 1.00 1.00 15\n", - " Home 0.98 0.89 0.93 168\n", - "Pension and insurances 0.99 1.00 1.00 110\n", - "\n", - " accuracy 0.90 10000\n", - " macro avg 0.97 0.93 0.95 10000\n", - " weighted avg 0.91 0.90 0.90 10000\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "49fdc82d", - "metadata": {}, - "source": [ - "### 7. Clean up \n", - "\n", - "Remove the feature group and endpoint to clean up" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f79b1164", - "metadata": {}, - "outputs": [], - "source": [ - "#feature_group.delete()\n", - "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" - ] - }, - { - "cell_type": "markdown", - "id": "e04b6fa6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Notebook CI Test Results\n", - "\n", - "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", - "\n", - "![This us-east-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n" - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "smdemo", - "language": "python", - "name": "smdemo" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/financial_payment_classification_with_fs.ipynb b/financial_payment_classification_with_fs.ipynb deleted file mode 100644 index 7a5e1d6..0000000 --- a/financial_payment_classification_with_fs.ipynb +++ /dev/null @@ -1,2911 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "01b5c703", - "metadata": {}, - "source": [ - "# SageMaker Payment Classification \n" - ] - }, - { - "cell_type": "markdown", - "id": "6498f087", - "metadata": {}, - "source": [ - "---\n", - "\n", - "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. 
\n", - "\n", - "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "id": "c2e49281", - "metadata": {}, - "source": [ - "\n", - "## Background \n", - "\n", - "This notebook demonstrates how you can train and deploy a machine learning model to classify payment transactions. Enriching financial transactions with the category of the transaction. This can be used as an intermediate step in fraud detection, personalization or anomaly detection. As well as a method to provide end users (e.g. customers at a bank) with an overview of their spending habits. Amazon SageMaker can be used to train and deploy a XGBoost model, as well as the required underlying infrastructure. For this notebook a generated dataset is used where a payment consists of mostly an amount, sender, receiver and timestamp.\n", - "\n", - "\n", - "## Notebook overview \n", - "\n", - "This notebook consists of seven parts. First, we import and configure the required libraries. After that we prepare the data used in this example and create the feature store. With the newly created features we create a XGBoost model. An endpoint is created to host this model. We evaluate the performance of the model and end by cleaning up the used resources.\n", - "\n", - "## Dataset \n", - "\n", - "For this notebook we use a synthetic dataset. 
This dataset has the following features \n", - "\n", - "* __transaction_category__: The category of the transaction, this is one of the next 19 options.\n", - "\n", - " 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances'\n", - "\n", - "\n", - "* __receiver_id__: an identifier for the receiving party. The identifier consist of 16 numbers.\n", - "* __sender_id__: an identifier for the sending party. The identifier consist of 16 numbers.\n", - "* __amount__: the amount which is transferred.\n", - "* __timestamp__: the timestamp of the transaction in YYYY-MM-DD HH:MM:SS format.\n", - "\n", - "\n", - "### 1. Setup \n", - "\n", - "Before we start we need to update the sagemaker library" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fff19d6b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# import sys\n", - "# !{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", - "# !{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", - "# !{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": 
[ - "> 2024-02-11 16:21:51,304 [info] Identified pre-initialized git repo, using it: {'url': 'git://github.com/aviaIguazio/demo-sagemaker.git#refs/heads/development'}\n", - "> 2024-02-11 16:22:06,708 [info] Created and saved project: {'name': 'sagemaker-v3-admin', 'from_template': None, 'overwrite': False, 'context': './', 'save': True}\n", - "> 2024-02-11 16:22:07,592 [info] Project created successfully: {'project_name': 'sagemaker-v3', 'stored_in_db': True}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker-v3\", \n", - " user_project=True,\n", - " parameters={\n", - " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", - " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1b17a94d", - "metadata": {}, - "source": [ - "Now that we have the latest version we can import the libraries that we'll use in this notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "42c5d6d0", - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "import io\n", - "import sagemaker\n", - "import time\n", - "import os\n", - "from sklearn.metrics import classification_report\n", - "import pandas as pd\n", - "import numpy as np\n", - "from datetime import datetime, timedelta" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", - "metadata": {}, - "outputs": [], - "source": [ - "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", - "metadata": {}, - "outputs": [], - "source": [ - "sess = sagemaker.Session()\n", - "write_bucket = sess.default_bucket()\n", - "write_prefix = \"sagemaker-app-lab\"" - ] - }, - { - "cell_type": "markdown", - "id": "3af7c33d", - "metadata": {}, - "source": [ - "Let's set the session variables to ensure that SageMaker is configured 
correctly." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "c0e4db17", - "metadata": {}, - "outputs": [], - "source": [ - "region = sagemaker.Session().boto_region_name\n", - "sm_client = boto3.client(\"sagemaker\")\n", - "boto_session = boto3.Session(region_name=region)\n", - "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", - "role = sagemaker_role\n", - "bucket_prefix = \"payment-classification\"\n", - "s3_bucket = sagemaker_session.default_bucket()" - ] - }, - { - "cell_type": "markdown", - "id": "4fe6a975", - "metadata": {}, - "source": [ - "We define the factorize key which is used to map the '__transaction_category__' to numeric values" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "43946b9f", - "metadata": {}, - "outputs": [], - "source": [ - "factorize_key = {\n", - " \"Uncategorized\": 0,\n", - " \"Entertainment\": 1,\n", - " \"Education\": 2,\n", - " \"Shopping\": 3,\n", - " \"Personal Care\": 4,\n", - " \"Health and Fitness\": 5,\n", - " \"Food and Dining\": 6,\n", - " \"Gifts and Donations\": 7,\n", - " \"Investments\": 8,\n", - " \"Bills and Utilities\": 9,\n", - " \"Auto and Transport\": 10,\n", - " \"Travel\": 11,\n", - " \"Fees and Charges\": 12,\n", - " \"Business Services\": 13,\n", - " \"Personal Services\": 14,\n", - " \"Taxes\": 15,\n", - " \"Gambling\": 16,\n", - " \"Home\": 17,\n", - " \"Pension and insurances\": 18,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "5e3dc3c4", - "metadata": {}, - "source": [ - "### 2. 
Data preparation \n", - "\n", - "We ingest the simulated data from the public SageMaker S3 training database:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "5ff0d280", - "metadata": {}, - "outputs": [], - "source": [ - "s3 = boto3.client(\"s3\")\n", - "s3.download_file(\n", - " f\"sagemaker-example-files-prod-{region}\",\n", - " \"datasets/tabular/synthetic_financial/financial_transactions_mini.csv\",\n", - " \"financial_transactions_mini.csv\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "08578d93", - "metadata": {}, - "source": [ - "Let's start by loading the dataset from our csv file into a Pandas dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a477abd7", - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv(\n", - " \"financial_transactions_mini.csv\",\n", - " parse_dates=[\"timestamp\"],\n", - " infer_datetime_format=True,\n", - " dtype={\"transaction_category\": \"string\"},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cf6be447", - "metadata": {}, - "source": [ - "The dataframe looks as follows:\n", - "\n", - "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", - "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", - "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", - "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", - "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", - "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", - "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", - "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", - "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 
13:23:44 |\n", - "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", - "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", - "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8c15f00d-8f89-41ec-aa22-f23fc394d1b4", - "metadata": {}, - "outputs": [], - "source": [ - "from utils import update_timestamps\n", - "data=update_timestamps(data)" - ] - }, - { - "cell_type": "markdown", - "id": "b5492919", - "metadata": {}, - "source": [ - "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5d577920-41e4-40f0-baaf-4e2f363dc227", - "metadata": {}, - "outputs": [], - "source": [ - "data['transaction_id']= data.reset_index().index " - ] - }, - { - "cell_type": "markdown", - "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", - "metadata": { - "tags": [] - }, - "source": [ - "### 3. Create feature store \n", - "\n", - "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 
" - ] - }, - { - "cell_type": "markdown", - "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", - "metadata": {}, - "source": [ - "#### feature-group-payment-classification" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "7422a9ca-91d5-4aa7-bd44-993e309e11f5", - "metadata": {}, - "outputs": [], - "source": [ - "# move category to the first column to match sagemaker label train convention\"\n", - "def pop_and_move_to_start(d, key):\n", - " # Pop the item if it exists, otherwise return None\n", - " value = d.pop(key, None)\n", - " if value is not None:\n", - " # Move the popped item to the start\n", - " d = {key: value, **d}\n", - " return d\n", - "\n", - "def calculate_category_distance(event): \n", - " column_name ='transaction_category_mapped'\n", - " event = pop_and_move_to_start(event,column_name)\n", - " category = event[column_name]\n", - " event['distance'] = abs(event['amount']-event['amount_avg_1d']) \n", - " return event" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "4101c303-2da3-431b-9375-9fa1747070af", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "DateExtractor\n", - "\n", - "DateExtractor\n", - "\n", - "\n", - "\n", - "_start->DateExtractor\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "MapValues\n", - "\n", - "MapValues\n", - "\n", - "\n", - "\n", - "DateExtractor->MapValues\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Aggregates\n", - "\n", - "Aggregates\n", - "\n", - "\n", - "\n", - "MapValues->Aggregates\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "calculate_category_distance\n", - "\n", - "calculate_category_distance\n", - "\n", - "\n", - "\n", - "Aggregates->calculate_category_distance\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "DropFeatures\n", - "\n", - "DropFeatures\n", - "\n", - "\n", - "\n", - 
"calculate_category_distance->DropFeatures\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet/parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "DropFeatures->parquet/parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql/nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "DropFeatures->nosql/nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fstore\n", - "from mlrun.feature_store.steps import OneHotEncoder, MapValues, DateExtractor, DropFeatures\n", - "\n", - "# creating feature set\n", - "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", - " entities=[fstore.Entity(\"transaction_category\")],\n", - " description=\"transactions feature set\")\n", - "# setting up the graph\n", - "extended_transactions_set.graph \\\n", - " .to(DateExtractor(parts = ['year', 'month', 'day', 'hour','minute','second'], timestamp_col = 'timestamp')) \\\n", - " .to(MapValues({'transaction_category' : factorize_key}, with_original_features=True)) \\\n", - "\n", - "extended_transactions_set.add_aggregation(name='amount',column='amount',operations=['avg'],windows=['1d'],period='1h')\n", - "\n", - "extended_transactions_set.graph \\\n", - " .to(name=\"calculate_category_distance\", handler=\"calculate_category_distance\").after_step('Aggregates') \\\n", - " .to(DropFeatures(features=['timestamp']))\n", - "\n", - "\n", - "extended_transactions_set.set_targets()\n", - "\n", - "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53eb2151-447a-4eb0-be7f-a07f1cbea32d", - "metadata": {}, - "outputs": [], - "source": [ - "# Keeping every second row\n", - "df_kept = data.iloc[::2]\n", - "\n", - "# Or, to explicitly remove every second row (the opposite selection)\n", - "df_removed = 
data.drop(data.index[::2])\n", - "\n", - "\n", - "# Keeping every second row\n", - "df_kept = df_removed.iloc[::2]\n", - "\n", - "# Or, to explicitly remove every second row (the opposite selection)\n", - "df_removed_v2 = df_removed.drop(df_removed.index[::2])\n", - "\n", - "df_removed_v2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06c03ea5-8394-44ff-b81d-755e1c244269", - "metadata": {}, - "outputs": [], - "source": [ - "from utils import update_timestamps\n", - "df_removed_v2=update_timestamps(df_removed_v2)\n", - "df_removed_v2" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "1b6a6a84-fa0b-4db4-a3fc-aa02331718ed", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
transaction_category
Uncategorized0879.78000045185519044999194457298962882528879.783202445912270.000000
Uncategorized0840.50000047579519156690804655296518888015801.22720242611444939.280000
Uncategorized0701.43666745185519044999194910949333064003423.3111202431551621278.126667
Uncategorized0621.76000045185519044999194415760195692405382.7315202411712851239.030000
Uncategorized0519.76200040980889806929744412940106031926111.771920241385057407.992000
..........................................
Pension and insurances18211.45363641796068600888494359198069543354302.10999792024380465790.646364
Pension and insurances18211.10739147515386207333054021524999937895115.89999832024431104295.217391
Pension and insurances18211.09285244050083552203244165276502284291207.089998720242101834304.012852
Pension and insurances18211.61259040921157888775434328901131757235355.5899991202441811537143.967410
Pension and insurances18211.58623742620471944990064017367486513464204.269999520242111623157.326237
\n", - "

24999 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category_mapped amount_avg_1d \\\n", - "transaction_category \n", - "Uncategorized 0 879.780000 \n", - "Uncategorized 0 840.500000 \n", - "Uncategorized 0 701.436667 \n", - "Uncategorized 0 621.760000 \n", - "Uncategorized 0 519.762000 \n", - "... ... ... \n", - "Pension and insurances 18 211.453636 \n", - "Pension and insurances 18 211.107391 \n", - "Pension and insurances 18 211.092852 \n", - "Pension and insurances 18 211.612590 \n", - "Pension and insurances 18 211.586237 \n", - "\n", - " receiver_id sender_id amount \\\n", - "transaction_category \n", - "Uncategorized 4518551904499919 4457298962882528 879.78 \n", - "Uncategorized 4757951915669080 4655296518888015 801.22 \n", - "Uncategorized 4518551904499919 4910949333064003 423.31 \n", - "Uncategorized 4518551904499919 4415760195692405 382.73 \n", - "Uncategorized 4098088980692974 4412940106031926 111.77 \n", - "... ... ... ... \n", - "Pension and insurances 4179606860088849 4359198069543354 302.10 \n", - "Pension and insurances 4751538620733305 4021524999937895 115.89 \n", - "Pension and insurances 4405008355220324 4165276502284291 207.08 \n", - "Pension and insurances 4092115788877543 4328901131757235 355.58 \n", - "Pension and insurances 4262047194499006 4017367486513464 204.26 \n", - "\n", - " transaction_id timestamp_year timestamp_month \\\n", - "transaction_category \n", - "Uncategorized 3 2024 4 \n", - "Uncategorized 7 2024 2 \n", - "Uncategorized 11 2024 3 \n", - "Uncategorized 15 2024 1 \n", - "Uncategorized 19 2024 1 \n", - "... ... ... ... 
\n", - "Pension and insurances 99979 2024 3 \n", - "Pension and insurances 99983 2024 4 \n", - "Pension and insurances 99987 2024 2 \n", - "Pension and insurances 99991 2024 4 \n", - "Pension and insurances 99995 2024 2 \n", - "\n", - " timestamp_day timestamp_hour timestamp_minute \\\n", - "transaction_category \n", - "Uncategorized 5 9 12 \n", - "Uncategorized 6 11 44 \n", - "Uncategorized 15 5 16 \n", - "Uncategorized 17 12 8 \n", - "Uncategorized 3 8 50 \n", - "... ... ... ... \n", - "Pension and insurances 8 0 46 \n", - "Pension and insurances 3 11 0 \n", - "Pension and insurances 10 18 34 \n", - "Pension and insurances 18 11 53 \n", - "Pension and insurances 11 16 23 \n", - "\n", - " timestamp_second distance \n", - "transaction_category \n", - "Uncategorized 27 0.000000 \n", - "Uncategorized 49 39.280000 \n", - "Uncategorized 21 278.126667 \n", - "Uncategorized 51 239.030000 \n", - "Uncategorized 57 407.992000 \n", - "... ... ... \n", - "Pension and insurances 57 90.646364 \n", - "Pension and insurances 42 95.217391 \n", - "Pension and insurances 30 4.012852 \n", - "Pension and insurances 7 143.967410 \n", - "Pension and insurances 15 7.326237 \n", - "\n", - "[24999 rows x 13 columns]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ingested_data = extended_transactions_set.ingest(df_removed_v2, overwrite=True)\n", - "ingested_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6595564d-91a2-49c0-93e1-dc8ebb28467d", - "metadata": {}, - "outputs": [], - "source": [ - "#data = ingested_data.reset_index(drop=True)\n", - "data = ingested_data\n", - "#data = data[['transaction_category'] + [col for col in data.columns if col != 'transaction_category']]\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "247a27f6-f5d4-4fca-aad7-91aaf2c204f3", - "metadata": {}, - "outputs": [], - "source": [ - "# Import MLRun's Feature Store\n", - "import 
mlrun.feature_store as fstore\n", - "\n", - "data_cols = list(data.columns)\n", - "# create feature vector on top of aggreagations\n", - "# Define the list of features we will be using\n", - "features = [f\"transactions.{name}\" for name in data_cols[1:]] \n", - "\n", - "\n", - "# Define the feature vector name for future reference\n", - "fv_name = 'transactions-vector'\n", - "\n", - "# Define the feature vector using our Feature Store (fstore)\n", - "transactions_fv = fstore.FeatureVector(fv_name, \n", - " features,\n", - " label_feature='transactions.transaction_category_mapped',\n", - " description='stocks information')\n", - "\n", - "#label_feature = 'transactions-v2.transaction_category',\n", - "# Save the feature vector in the Feature Store\n", - "transactions_fv.save()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "eb69d9fa-22a9-4b9f-9443-d00d9190ad55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
amount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistancetransaction_category_mapped
0879.78000045185519044999194457298962882528879.783202445912270.0000000
1840.50000047579519156690804655296518888015801.22720242611444939.2800000
2701.43666745185519044999194910949333064003423.3111202431551621278.1266670
3621.76000045185519044999194415760195692405382.7315202411712851239.0300000
4519.76200040980889806929744412940106031926111.771920241385057407.9920000
..........................................
24994211.45363641796068600888494359198069543354302.10999792024380465790.64636418
24995211.10739147515386207333054021524999937895115.89999832024431104295.21739118
24996211.09285244050083552203244165276502284291207.089998720242101834304.01285218
24997211.61259040921157888775434328901131757235355.5899991202441811537143.96741018
24998211.58623742620471944990064017367486513464204.269999520242111623157.32623718
\n", - "

24999 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " amount_avg_1d receiver_id sender_id amount \\\n", - "0 879.780000 4518551904499919 4457298962882528 879.78 \n", - "1 840.500000 4757951915669080 4655296518888015 801.22 \n", - "2 701.436667 4518551904499919 4910949333064003 423.31 \n", - "3 621.760000 4518551904499919 4415760195692405 382.73 \n", - "4 519.762000 4098088980692974 4412940106031926 111.77 \n", - "... ... ... ... ... \n", - "24994 211.453636 4179606860088849 4359198069543354 302.10 \n", - "24995 211.107391 4751538620733305 4021524999937895 115.89 \n", - "24996 211.092852 4405008355220324 4165276502284291 207.08 \n", - "24997 211.612590 4092115788877543 4328901131757235 355.58 \n", - "24998 211.586237 4262047194499006 4017367486513464 204.26 \n", - "\n", - " transaction_id timestamp_year timestamp_month timestamp_day \\\n", - "0 3 2024 4 5 \n", - "1 7 2024 2 6 \n", - "2 11 2024 3 15 \n", - "3 15 2024 1 17 \n", - "4 19 2024 1 3 \n", - "... ... ... ... ... \n", - "24994 99979 2024 3 8 \n", - "24995 99983 2024 4 3 \n", - "24996 99987 2024 2 10 \n", - "24997 99991 2024 4 18 \n", - "24998 99995 2024 2 11 \n", - "\n", - " timestamp_hour timestamp_minute timestamp_second distance \\\n", - "0 9 12 27 0.000000 \n", - "1 11 44 49 39.280000 \n", - "2 5 16 21 278.126667 \n", - "3 12 8 51 239.030000 \n", - "4 8 50 57 407.992000 \n", - "... ... ... ... ... \n", - "24994 0 46 57 90.646364 \n", - "24995 11 0 42 95.217391 \n", - "24996 18 34 30 4.012852 \n", - "24997 11 53 7 143.967410 \n", - "24998 16 23 15 7.326237 \n", - "\n", - " transaction_category_mapped \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "... ... 
\n", - "24994 18 \n", - "24995 18 \n", - "24996 18 \n", - "24997 18 \n", - "24998 18 \n", - "\n", - "[24999 rows x 13 columns]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fs\n", - "resp = transactions_fv.get_offline_features()\n", - "#Preview the dataset\n", - "fv_data = resp.to_dataframe()\n", - "fv_data" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "cb156ebe-9846-4ff3-a388-92362df7c741", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'amount_avg_1d': 211.58623655913973,\n", - " 'receiver_id': 4262047194499006,\n", - " 'sender_id': 4017367486513464,\n", - " 'amount': 204.26,\n", - " 'transaction_id': 99995,\n", - " 'timestamp_year': 2024,\n", - " 'timestamp_month': 2,\n", - " 'timestamp_day': 11,\n", - " 'timestamp_hour': 16,\n", - " 'timestamp_minute': 23,\n", - " 'timestamp_second': 15,\n", - " 'distance': 7.326236559139744}]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "svc = transactions_fv.get_online_feature_service()\n", - "resp = svc.get([{\"transaction_category\": \"Pension and insurances\"}])\n", - "resp" - ] - }, - { - "cell_type": "markdown", - "id": "b5e4834e", - "metadata": {}, - "source": [ - "We update the values in the feature store with the real values of our data" - ] - }, - { - "cell_type": "markdown", - "id": "e2f6395f", - "metadata": {}, - "source": [ - "And display them after getting them from the feature store" - ] - }, - { - "cell_type": "markdown", - "id": "cf148985", - "metadata": {}, - "source": [ - "We use the feature store to calculate the distance between the average of every category and the current amount" - ] - }, - { - "cell_type": "markdown", - "id": "289eeca6", - "metadata": {}, - "source": [ - "### 4. 
Create model \n", - "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", - "\n", - "\n", - "\n", - "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "1cbb00b5-46bf-4a20-aad9-a03716ab97ae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_category_mappedamount_avg_1dreceiver_idsender_idamounttransaction_idtimestamp_yeartimestamp_monthtimestamp_daytimestamp_hourtimestamp_minutetimestamp_seconddistance
00879.78000045185519044999194457298962882528879.783202445912270.000000
10840.50000047579519156690804655296518888015801.22720242611444939.280000
20701.43666745185519044999194910949333064003423.3111202431551621278.126667
30621.76000045185519044999194415760195692405382.7315202411712851239.030000
40519.76200040980889806929744412940106031926111.771920241385057407.992000
..........................................
2499418211.45363641796068600888494359198069543354302.10999792024380465790.646364
2499518211.10739147515386207333054021524999937895115.89999832024431104295.217391
2499618211.09285244050083552203244165276502284291207.089998720242101834304.012852
2499718211.61259040921157888775434328901131757235355.5899991202441811537143.967410
2499818211.58623742620471944990064017367486513464204.269999520242111623157.326237
\n", - "

24999 rows × 13 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category_mapped amount_avg_1d receiver_id \\\n", - "0 0 879.780000 4518551904499919 \n", - "1 0 840.500000 4757951915669080 \n", - "2 0 701.436667 4518551904499919 \n", - "3 0 621.760000 4518551904499919 \n", - "4 0 519.762000 4098088980692974 \n", - "... ... ... ... \n", - "24994 18 211.453636 4179606860088849 \n", - "24995 18 211.107391 4751538620733305 \n", - "24996 18 211.092852 4405008355220324 \n", - "24997 18 211.612590 4092115788877543 \n", - "24998 18 211.586237 4262047194499006 \n", - "\n", - " sender_id amount transaction_id timestamp_year \\\n", - "0 4457298962882528 879.78 3 2024 \n", - "1 4655296518888015 801.22 7 2024 \n", - "2 4910949333064003 423.31 11 2024 \n", - "3 4415760195692405 382.73 15 2024 \n", - "4 4412940106031926 111.77 19 2024 \n", - "... ... ... ... ... \n", - "24994 4359198069543354 302.10 99979 2024 \n", - "24995 4021524999937895 115.89 99983 2024 \n", - "24996 4165276502284291 207.08 99987 2024 \n", - "24997 4328901131757235 355.58 99991 2024 \n", - "24998 4017367486513464 204.26 99995 2024 \n", - "\n", - " timestamp_month timestamp_day timestamp_hour timestamp_minute \\\n", - "0 4 5 9 12 \n", - "1 2 6 11 44 \n", - "2 3 15 5 16 \n", - "3 1 17 12 8 \n", - "4 1 3 8 50 \n", - "... ... ... ... ... \n", - "24994 3 8 0 46 \n", - "24995 4 3 11 0 \n", - "24996 2 10 18 34 \n", - "24997 4 18 11 53 \n", - "24998 2 11 16 23 \n", - "\n", - " timestamp_second distance \n", - "0 27 0.000000 \n", - "1 49 39.280000 \n", - "2 21 278.126667 \n", - "3 51 239.030000 \n", - "4 57 407.992000 \n", - "... ... ... 
\n", - "24994 57 90.646364 \n", - "24995 42 95.217391 \n", - "24996 30 4.012852 \n", - "24997 7 143.967410 \n", - "24998 15 7.326237 \n", - "\n", - "[24999 rows x 13 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fs\n", - "resp = transactions_fv.get_offline_features()\n", - "#Preview the dataset\n", - "fv_data = resp.to_dataframe()\n", - "\n", - "column_to_move = 'transaction_category_mapped'\n", - "\n", - "new_columns_order = [column_to_move] + [col for col in fv_data.columns if col != column_to_move]\n", - "fv_data = fv_data[new_columns_order]\n", - "\n", - "\n", - "data = fv_data\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "47512de3-60ac-49c7-ace8-031959527e86", - "metadata": {}, - "outputs": [], - "source": [ - "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", - "train_data, validation_data, test_data = np.split(\n", - " fv_data.sample(frac=1, random_state=42), [int(0.7 * len(fv_data)), int(0.9 * len(fv_data))]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f81f65b9", - "metadata": {}, - "source": [ - "We save these sets to a file." 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "f849a7a9", - "metadata": {}, - "outputs": [], - "source": [ - "train_data.to_csv(\"train.csv\", index=False, header=False)\n", - "validation_data.to_csv(\"validation.csv\", index=False, header=False)\n", - "test_data.to_csv(\"test.csv\", index=False, header=True)" - ] - }, - { - "cell_type": "markdown", - "id": "de669936", - "metadata": {}, - "source": [ - "And upload these files to our s3 bucket" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "e1ca2543", - "metadata": {}, - "outputs": [], - "source": [ - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"train/train.csv\")\n", - ").upload_file(\"train.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"validation/validation.csv\")\n", - ").upload_file(\"validation.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"test/test.csv\")\n", - ").upload_file(\"test.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "22de532f", - "metadata": {}, - "source": [ - "Get the XGBoost sagemaker image" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "a41b6a7d", - "metadata": {}, - "outputs": [], - "source": [ - "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" - ] - }, - { - "cell_type": "markdown", - "id": "66cae2a9", - "metadata": {}, - "source": [ - "Transform our data to a sagemaker input for training" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "e51c917a", - "metadata": {}, - "outputs": [], - "source": [ - "s3_input_train = sagemaker.inputs.TrainingInput(\n", - " s3_data=\"s3://{}/{}/train\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")\n", - "s3_input_validation = sagemaker.inputs.TrainingInput(\n", - " 
s3_data=\"s3://{}/{}/validation/\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6f2985d8", - "metadata": {}, - "source": [ - "We define the XGBoost model" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "92c1fe8c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb = sagemaker.estimator.Estimator(\n", - " container,\n", - " role,\n", - " instance_count=1,\n", - " instance_type=\"ml.m4.xlarge\",\n", - " output_path=\"s3://{}/{}/output\".format(s3_bucket, bucket_prefix),\n", - " sagemaker_session=sagemaker_session,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ecafdfe8", - "metadata": {}, - "source": [ - "Set the parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "582adc6c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb.set_hyperparameters(\n", - " max_depth=5,\n", - " eta=0.2,\n", - " gamma=4,\n", - " min_child_weight=6,\n", - " subsample=0.8,\n", - " objective=\"multi:softprob\",\n", - " num_class=19,\n", - " verbosity=0,\n", - " num_round=100,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b36463dd", - "metadata": {}, - "source": [ - "And train the model" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "c24e06fc", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-02-11-16-32-32-584\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-02-11 16:32:32 Starting - Starting the training job...\n", - "2024-02-11 16:32:46 Starting - Preparing the instances for training......\n", - "2024-02-11 16:33:57 Downloading - Downloading input data......\n", - "2024-02-11 16:34:37 Downloading - Downloading the training image...\n", - "2024-02-11 16:35:23 Training - Training image download completed. 
Training in progress...\u001b[34m[2024-02-11 16:35:36.980 ip-10-0-69-198.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Single node training.\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Train matrix has 17499 rows and 12 columns\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Validation matrix has 5000 rows\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.099 ip-10-0-69-198.us-east-2.compute.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.100 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. 
SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.101 ip-10-0-69-198.us-east-2.compute.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.102 ip-10-0-69-198.us-east-2.compute.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-02-11:16:35:37:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.00166#011validation-merror:0.00380\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.268 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-02-11 16:35:37.271 ip-10-0-69-198.us-east-2.compute.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.00126#011validation-merror:0.00320\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.00120#011validation-merror:0.00320\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.00109#011validation-merror:0.00280\u001b[0m\n", - "\u001b[34m[6]#011train-merror:0.00103#011validation-merror:0.00260\u001b[0m\n", - "\u001b[34m[7]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.00063#011validation-merror:0.00180\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", - 
"\u001b[34m[12]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.00046#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.00040#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.00046#011validation-merror:0.00180\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.00029#011validation-merror:0.00140\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.00040#011validation-merror:0.00160\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.00023#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.00017#011validation-merror:0.00100\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.00017#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[29]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[30]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.00006#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - 
"\u001b[34m[35]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[52]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[53]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - 
"\u001b[34m[58]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[76]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - 
"\u001b[34m[81]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[90]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[98]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.00011#011validation-merror:0.00120\u001b[0m\n", - "\n", - "2024-02-11 16:35:53 Uploading - Uploading generated training model\n", - "2024-02-11 16:36:04 Completed - Training job completed\n", - "Training seconds: 127\n", - "Billable seconds: 127\n" - ] - } - ], - "source": [ - "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" - ] - }, - { - "cell_type": "markdown", - "id": "8b716cd7", - "metadata": {}, - "source": [ - "### 
5. Using the endpoint \n", - "\n", - "Deploy the model to an endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "042ae1c4-4ad3-42ba-883e-87fd7b1bcb20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz'" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "xgb.model_data" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "78444d49-4ad3-49e4-a579-19b173facb26", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function = project.get_function(\"serving\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "911457fa-812d-4991-a31c-4dfcb1593d3e", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function_v2 = project.set_function(\n", - " func=\"src/functions/serving.py\",\n", - " name=\"serving-v2\",\n", - " kind=\"serving\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2881c17d-dd84-43d7-acc7-83e40c8110d3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "_start->\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "xgboost-model\n", - "\n", - "xgboost-model\n", - "\n", - "\n", - "\n", - "->xgboost-model\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "graph = serving_function_v2.set_topology(\n", - " \"router\",\n", - " 
mlrun.serving.routers.EnrichmentModelRouter(\n", - " feature_vector_uri=transactions_fv.uri,\n", - " impute_policy={\"*\": \"$mean\"}),\n", - ")\n", - "serving_function_v2.add_model(\"xgboost-model\", class_name=\"XGBModelServer\", model_path=xgb.model_data)\n", - "\n", - "# Plot the ensemble configuration\n", - "serving_function_v2.spec.graph.plot()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "0ab0bcd2-5c70-4f48-bff9-d060f027e8e5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:36:45,242 [info] model xgboost-model was loaded\n", - "> 2024-02-11 16:36:45,243 [info] Loaded ['xgboost-model']\n" - ] - } - ], - "source": [ - "server = serving_function_v2.to_mock_server()" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "dd57cfcd-5878-4775-83ee-422dc2261ce8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'inputs': [[211.58623655913973, 4262047194499006, 4017367486513464, 204.26, 99995, 2024, 2, 11, 16, 23, 15, 7.326236559139744]]}\n" - ] - }, - { - "data": { - "text/plain": [ - "{'id': '9fca777838b34ecaa9a0978beb4c3324',\n", - " 'model_name': 'xgboost-model',\n", - " 'outputs': [[0.0006098453304730356,\n", - " 0.000491024402435869,\n", - " 0.0005141795263625681,\n", - " 0.0007783450419083238,\n", - " 0.0007057395414449275,\n", - " 0.0006167895044200122,\n", - " 0.0008293685968965292,\n", - " 0.0007642377750016749,\n", - " 0.0004749966901727021,\n", - " 0.0009146890370175242,\n", - " 0.0023798206821084023,\n", - " 0.0007584734121337533,\n", - " 0.0005588593194261193,\n", - " 0.0018726944690570235,\n", - " 0.001147600938566029,\n", - " 0.0010383735643699765,\n", - " 0.0010812224354594946,\n", - " 0.006694969721138477,\n", - " 0.9777687788009644]]}" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "response = server.test(body={'inputs':[['Pension 
and insurances']]})\n", - "response" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:39:18,386 [info] Starting remote function deploy\n", - "2024-02-11 16:39:18 (info) Deploying function\n", - "2024-02-11 16:39:18 (info) Building\n", - "2024-02-11 16:39:19 (info) Staging files and preparing base images\n", - "2024-02-11 16:39:19 (info) Building processor image\n", - "2024-02-11 16:40:24 (info) Build complete\n", - "2024-02-11 16:40:33 (info) Function deploy complete\n", - "> 2024-02-11 16:40:40,471 [info] Successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-v3-admin-serving-v2.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/']}\n" - ] - }, - { - "data": { - "text/plain": [ - "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/', 'name': 'sagemaker-v3-admin-serving-v2'})" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "project.deploy_function(\"serving-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "ac19dc03-01e2-4e29-ba75-a34804833d5c", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function_v2 = project.get_function(\"serving-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:40:40,546 [info] Invoking function: {'method': 'POST', 'path': 
'http://sagemaker-v3-admin-serving-v2-sagemaker-v3-admin.default-tenant.app.cust-cs-il-353.iguazio-cd2.com//v2/models/xgboost-model/predict'}\n" - ] - } - ], - "source": [ - "response = serving_function_v2.invoke(path='/v2/models/xgboost-model/predict', body={\"inputs\": [['Pension and insurances']]})" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "57eeaddc-654a-41d2-bb51-4a9a787a3311", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '641c5971-c881-4d56-a326-8b02900be8db',\n", - " 'model_name': 'xgboost-model',\n", - " 'outputs': [[0.0006098453304730356,\n", - " 0.000491024402435869,\n", - " 0.0005141795263625681,\n", - " 0.0007783450419083238,\n", - " 0.0007057395414449275,\n", - " 0.0006167895044200122,\n", - " 0.0008293685968965292,\n", - " 0.0007642377750016749,\n", - " 0.0004749966901727021,\n", - " 0.0009146890370175242,\n", - " 0.0023798206821084023,\n", - " 0.0007584734121337533,\n", - " 0.0005588593194261193,\n", - " 0.0018726944690570235,\n", - " 0.001147600938566029,\n", - " 0.0010383735643699765,\n", - " 0.0010812224354594946,\n", - " 0.006694969721138477,\n", - " 0.9777687788009644]]}" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "response" - ] - }, - { - "cell_type": "markdown", - "id": "712f4d35", - "metadata": {}, - "source": [ - "### 6. 
Evaluate performance \n", - "\n", - "Run the model on our test data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35ff008b-f4e4-491b-b1e8-3b0a652c35fc", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "2e863ea7-5804-4637-b677-390c305cabfe", - "metadata": {}, - "outputs": [], - "source": [ - "s3_data = \"s3://{}/{}/test/test.csv\".format(s3_bucket, bucket_prefix)" - ] - }, - { - "cell_type": "markdown", - "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", - "metadata": {}, - "source": [ - "Add the evaluation function to our project" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "ca4f7e49", - "metadata": {}, - "outputs": [], - "source": [ - "evaluate_function = project.get_function(\"evaluate\")" - ] - }, - { - "cell_type": "markdown", - "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", - "metadata": {}, - "source": [ - "Run the evaluation job" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:40:40,769 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': '77fb208c0816491e9f7ae69634b31b1b', 'db': 'https://mlrun-api.default-tenant.app.cust-cs-il-353.iguazio-cd2.com'}\n", - "> 2024-02-11 16:40:41,204 [info] Job is running in the background, pod: evaluate-evaluate-x5mzk\n", - "[16:40:45] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", - "configuration generated by an older version of XGBoost, please export the model by calling\n", - "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", - "\n", - " https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html\n", - "\n", - "for more details about differences between saving model and serializing.\n", - "\n", - "> 2024-02-11 16:40:46,110 [info] To track results use the CLI: {'info_cmd': 'mlrun get run 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin', 'logs_cmd': 'mlrun logs 77fb208c0816491e9f7ae69634b31b1b -p sagemaker-v3-admin'}\n", - "> 2024-02-11 16:40:46,110 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.cust-cs-il-353.iguazio-cd2.com/mlprojects/sagemaker-v3-admin/jobs/monitor/77fb208c0816491e9f7ae69634b31b1b/overview'}\n", - "> 2024-02-11 16:40:46,110 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sagemaker-v3-admin0Feb 11 16:40:44completedevaluate-evaluate
v3io_user=admin
kind=job
owner=admin
mlrun/client_version=1.6.0-rc26
mlrun/client_python_version=3.9.18
host=evaluate-evaluate-x5mzk
model_path=s3://sagemaker-us-east-2-934638699319/payment-classification/output/sagemaker-xgboost-2024-02-11-16-32-32-584/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-2-934638699319/payment-classification/test/test.csv
label_column=transaction_category_mapped
factorize_key={'Uncategorized': 0, 'Entertainment': 1, 'Education': 2, 'Shopping': 3, 'Personal Care': 4, 'Health and Fitness': 5, 'Food and Dining': 6, 'Gifts and Donations': 7, 'Investments': 8, 'Bills and Utilities': 9, 'Auto and Transport': 10, 'Travel': 11, 'Fees and Charges': 12, 'Business Services': 13, 'Personal Services': 14, 'Taxes': 15, 'Gambling': 16, 'Home': 17, 'Pension and insurances': 18}
classification_report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-02-11 16:40:54,323 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" - ] - } - ], - "source": [ - "evaluate_run = evaluate_function.run(\n", - " handler=\"evaluate\",\n", - " params={\n", - " \"model_path\": xgb.model_data,\n", - " \"model_name\": \"xgboost-model\",\n", - " \"test_set\": s3_data,\n", - " \"label_column\": \"transaction_category_mapped\",\n", - " \"factorize_key\": factorize_key,\n", - " },\n", - " returns=[\"classification_report: dataset\"])" - ] - }, - { - "cell_type": "markdown", - "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", - "metadata": {}, - "source": [ - "See the evaluation result" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
precisionrecallf1-scoresupport
Uncategorized1.0000001.0000001.0000008.0000
Entertainment1.0000001.0000001.000000362.0000
Education1.0000000.9285710.96296314.0000
Shopping0.9988391.0000000.999419860.0000
Personal Care1.0000001.0000001.00000023.0000
Health and Fitness1.0000001.0000001.000000123.0000
Food and Dining1.0000001.0000001.000000217.0000
Gifts and Donations1.0000001.0000001.00000068.0000
Investments0.9166671.0000000.95652222.0000
Bills and Utilities1.0000001.0000001.00000092.0000
Auto and Transport1.0000001.0000001.000000475.0000
Travel1.0000001.0000001.00000036.0000
Fees and Charges1.0000001.0000001.00000029.0000
Business Services1.0000001.0000001.00000047.0000
Personal Services1.0000001.0000001.00000024.0000
Taxes1.0000000.8461540.91666713.0000
Gambling1.0000001.0000001.0000008.0000
Home1.0000001.0000001.00000052.0000
Pension and insurances1.0000001.0000001.00000027.0000
accuracy0.9988000.9988000.9988000.9988
macro avg0.9955530.9881430.9913462500.0000
weighted avg0.9988670.9988000.9987772500.0000
\n", - "
" - ], - "text/plain": [ - " precision recall f1-score support\n", - "Uncategorized 1.000000 1.000000 1.000000 8.0000\n", - "Entertainment 1.000000 1.000000 1.000000 362.0000\n", - "Education 1.000000 0.928571 0.962963 14.0000\n", - "Shopping 0.998839 1.000000 0.999419 860.0000\n", - "Personal Care 1.000000 1.000000 1.000000 23.0000\n", - "Health and Fitness 1.000000 1.000000 1.000000 123.0000\n", - "Food and Dining 1.000000 1.000000 1.000000 217.0000\n", - "Gifts and Donations 1.000000 1.000000 1.000000 68.0000\n", - "Investments 0.916667 1.000000 0.956522 22.0000\n", - "Bills and Utilities 1.000000 1.000000 1.000000 92.0000\n", - "Auto and Transport 1.000000 1.000000 1.000000 475.0000\n", - "Travel 1.000000 1.000000 1.000000 36.0000\n", - "Fees and Charges 1.000000 1.000000 1.000000 29.0000\n", - "Business Services 1.000000 1.000000 1.000000 47.0000\n", - "Personal Services 1.000000 1.000000 1.000000 24.0000\n", - "Taxes 1.000000 0.846154 0.916667 13.0000\n", - "Gambling 1.000000 1.000000 1.000000 8.0000\n", - "Home 1.000000 1.000000 1.000000 52.0000\n", - "Pension and insurances 1.000000 1.000000 1.000000 27.0000\n", - "accuracy 0.998800 0.998800 0.998800 0.9988\n", - "macro avg 0.995553 0.988143 0.991346 2500.0000\n", - "weighted avg 0.998867 0.998800 0.998777 2500.0000" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluate_run.artifact(\"classification_report\").as_df()" - ] - }, - { - "cell_type": "markdown", - "id": "98d0b67e", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "You should see results similar to this:\n", - "\n", - "```\n", - " precision recall f1-score support\n", - "\n", - " Uncategorized 1.00 0.92 0.96 51\n", - " Entertainment 0.81 0.89 0.85 1486\n", - " Education 1.00 0.94 0.97 80\n", - " Shopping 0.86 0.94 0.90 3441\n", - " Personal Care 1.00 0.98 0.99 132\n", - " Health and Fitness 0.99 0.89 0.94 443\n", - " 
Food and Dining 0.99 0.82 0.90 918\n", - " Gifts and Donations 1.00 0.95 0.97 275\n", - " Investments 0.99 0.97 0.98 88\n", - " Bills and Utilities 1.00 0.99 1.00 332\n", - " Auto and Transport 0.94 0.84 0.88 1967\n", - " Travel 0.96 0.84 0.90 120\n", - " Fees and Charges 1.00 0.94 0.97 106\n", - " Business Services 1.00 0.99 1.00 146\n", - " Personal Services 1.00 0.96 0.98 75\n", - " Taxes 0.98 0.94 0.96 47\n", - " Gambling 1.00 1.00 1.00 15\n", - " Home 0.98 0.89 0.93 168\n", - "Pension and insurances 0.99 1.00 1.00 110\n", - "\n", - " accuracy 0.90 10000\n", - " macro avg 0.97 0.93 0.95 10000\n", - " weighted avg 0.91 0.90 0.90 10000\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "49fdc82d", - "metadata": {}, - "source": [ - "### 7. Clean up \n", - "\n", - "Remove the feature group and endpoint to clean up" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f79b1164", - "metadata": {}, - "outputs": [], - "source": [ - "#feature_group.delete()\n", - "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" - ] - }, - { - "cell_type": "markdown", - "id": "e04b6fa6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Notebook CI Test Results\n", - "\n", - "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", - "\n", - "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-east-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-3 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n" - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "smdemo", - "language": "python", - "name": "smdemo" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/financial_payment_classification_with_serving.ipynb b/financial_payment_classification_with_serving.ipynb deleted file mode 100644 index 31eb690..0000000 --- a/financial_payment_classification_with_serving.ipynb +++ /dev/null @@ -1,2137 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "01b5c703", - "metadata": {}, - "source": [ - "# SageMaker Payment Classification \n" - ] - }, - { - "cell_type": "markdown", - "id": "6498f087", - "metadata": {}, - "source": [ - "---\n", - "\n", - "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", - "\n", - "![This us-west-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "id": "c2e49281", - "metadata": {}, - "source": [ - "\n", - "## Background \n", - "\n", - "This notebook demonstrates how you can train and deploy a machine learning model to classify payment transactions. Enriching financial transactions with the category of the transaction. This can be used as an intermediate step in fraud detection, personalization or anomaly detection. As well as a method to provide end users (e.g. customers at a bank) with an overview of their spending habits. Amazon SageMaker can be used to train and deploy a XGBoost model, as well as the required underlying infrastructure. For this notebook a generated dataset is used where a payment consists of mostly an amount, sender, receiver and timestamp.\n", - "\n", - "\n", - "## Notebook overview \n", - "\n", - "This notebook consists of seven parts. First, we import and configure the required libraries. After that we prepare the data used in this example and create the feature store. With the newly created features we create a XGBoost model. An endpoint is created to host this model. We evaluate the performance of the model and end by cleaning up the used resources.\n", - "\n", - "## Dataset \n", - "\n", - "For this notebook we use a synthetic dataset. 
This dataset has the following features \n", - "\n", - "* __transaction_category__: The category of the transaction, this is one of the next 19 options.\n", - "\n", - " 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances'\n", - "\n", - "\n", - "* __receiver_id__: an identifier for the receiving party. The identifier consist of 16 numbers.\n", - "* __sender_id__: an identifier for the sending party. The identifier consist of 16 numbers.\n", - "* __amount__: the amount which is transferred.\n", - "* __timestamp__: the timestamp of the transaction in YYYY-MM-DD HH:MM:SS format.\n", - "\n", - "\n", - "### 1. Setup \n", - "\n", - "Before we start we need to update the sagemaker library" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fff19d6b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import sys\n", - "!{sys.executable} -m pip install --upgrade pip --quiet # upgrade pip to the latest vesion\n", - "!{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion\n", - "!{sys.executable} -m pip install --upgrade boto --quiet # upgrade boto to the latest vesion" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "32a9c9d4-1515-4d8e-ad4c-e2f88544e67f", - "metadata": {}, - "outputs": [], - "source": [ - "import mlrun" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1243f1c6-8043-41f6-a64f-1da7017622ac", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 
2024-01-25 14:10:06,832 [info] Project loaded successfully: {'project_name': 'sagemaker'}\n" - ] - } - ], - "source": [ - "project = mlrun.get_or_create_project(\n", - " name=\"sagemaker\", \n", - " user_project=True,\n", - " parameters={\n", - " # \"source\" : \"git://github.com/mlrun/demo-sagemaker#main\",\n", - " \"default_image\" : \"yonishelach/sagemaker-demo\"\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "1b17a94d", - "metadata": {}, - "source": [ - "Now that we have the latest version we can import the libraries that we'll use in this notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "42c5d6d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", - "sagemaker.config INFO - Not applying SDK defaults from location: /User/.config/sagemaker/config.yaml\n" - ] - } - ], - "source": [ - "import boto3\n", - "import io\n", - "import sagemaker\n", - "import time\n", - "import os\n", - "from time import sleep\n", - "from sklearn.metrics import classification_report\n", - "from sagemaker.feature_store.feature_group import FeatureGroup\n", - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "6406c0df-e745-4e3d-ad98-7d4504ff8b07", - "metadata": {}, - "outputs": [], - "source": [ - "sagemaker_role = os.environ[\"SAGEMAKER-ROLE\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b0f0ea71-1c48-4174-a0bd-e1b4c0137d25", - "metadata": {}, - "outputs": [], - "source": [ - "sess = sagemaker.Session()\n", - "write_bucket = sess.default_bucket()\n", - "write_prefix = \"sagemaker-app-lab\"" - ] - }, - { - "cell_type": "markdown", - "id": "3af7c33d", - "metadata": {}, - "source": [ - "Let's set the session variables to ensure that SageMaker is configured correctly." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c0e4db17", - "metadata": {}, - "outputs": [], - "source": [ - "region = sagemaker.Session().boto_region_name\n", - "sm_client = boto3.client(\"sagemaker\")\n", - "boto_session = boto3.Session(region_name=region)\n", - "sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)\n", - "#role = sagemaker.get_execution_role()\n", - "role = sagemaker_role\n", - "bucket_prefix = \"payment-classification\"\n", - "s3_bucket = sagemaker_session.default_bucket()" - ] - }, - { - "cell_type": "markdown", - "id": "4fe6a975", - "metadata": {}, - "source": [ - "We define the factorize key which is used to map the '__transaction_category__' to numeric values" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "43946b9f", - "metadata": {}, - "outputs": [], - "source": [ - "factorize_key = {\n", - " \"Uncategorized\": 0,\n", - " \"Entertainment\": 1,\n", - " \"Education\": 2,\n", - " \"Shopping\": 3,\n", - " \"Personal Care\": 4,\n", - " \"Health and Fitness\": 5,\n", - " \"Food and Dining\": 6,\n", - " \"Gifts and Donations\": 7,\n", - " \"Investments\": 8,\n", - " \"Bills and Utilities\": 9,\n", - " \"Auto and Transport\": 10,\n", - " \"Travel\": 11,\n", - " \"Fees and Charges\": 12,\n", - " \"Business Services\": 13,\n", - " \"Personal Services\": 14,\n", - " \"Taxes\": 15,\n", - " \"Gambling\": 16,\n", - " \"Home\": 17,\n", - " \"Pension and insurances\": 18,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "5e3dc3c4", - "metadata": {}, - "source": [ - "### 2. 
Data preparation \n", - "\n", - "We ingest the simulated data from the public SageMaker S3 training database:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "5ff0d280", - "metadata": {}, - "outputs": [], - "source": [ - "s3 = boto3.client(\"s3\")\n", - "s3.download_file(\n", - " f\"sagemaker-example-files-prod-{region}\",\n", - " \"datasets/tabular/synthetic_financial/financial_transactions_mini.csv\",\n", - " \"financial_transactions_mini.csv\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "08578d93", - "metadata": {}, - "source": [ - "Let's start by loading the dataset from our csv file into a Pandas dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a477abd7", - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv(\n", - " \"financial_transactions_mini.csv\",\n", - " parse_dates=[\"timestamp\"],\n", - " infer_datetime_format=True,\n", - " dtype={\"transaction_category\": \"string\"},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cf6be447", - "metadata": {}, - "source": [ - "The dataframe looks as follows:\n", - "\n", - "| | transaction_category | receiver_id | sender_id | amount | timestamp |\n", - "|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|\n", - "| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |\n", - "| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |\n", - "| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |\n", - "| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |\n", - "| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |\n", - "| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |\n", - "| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 
13:23:44 |\n", - "| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |\n", - "| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |\n", - "| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |" - ] - }, - { - "cell_type": "markdown", - "id": "b5492919", - "metadata": {}, - "source": [ - "Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "24f6090e", - "metadata": {}, - "outputs": [], - "source": [ - "data[\"year\"] = data[\"timestamp\"].dt.year\n", - "data[\"month\"] = data[\"timestamp\"].dt.month\n", - "data[\"day\"] = data[\"timestamp\"].dt.day\n", - "data[\"hour\"] = data[\"timestamp\"].dt.hour\n", - "data[\"minute\"] = data[\"timestamp\"].dt.minute\n", - "data[\"second\"] = data[\"timestamp\"].dt.second\n", - "\n", - "del data[\"timestamp\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "79b0854f-c209-4092-ac0f-a680f35c2c74", - "metadata": {}, - "outputs": [], - "source": [ - "for key, val in factorize_key.items():\n", - " factorize_key[key] = str(val)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "0ee06b1d-0cfb-4242-a7e7-2443a0377d99", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\n", - "[ 'Uncategorized', 'Entertainment', 'Education',\n", - " 'Shopping', 'Personal Care', 'Health and Fitness',\n", - " 'Food and Dining', 'Gifts and Donations', 'Investments',\n", - " 'Bills and Utilities', 'Auto and Transport', 'Travel',\n", - " 'Fees and Charges', 'Business Services', 'Personal Services',\n", - " 'Taxes', 'Gambling', 'Home',\n", - " 'Pension and insurances']\n", - "Length: 19, dtype: string" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data[\"transaction_category\"].unique()" - ] - }, - { - 
"cell_type": "markdown", - "id": "f7314f8a", - "metadata": {}, - "source": [ - "We'll transform the transaction categories to numeric targets for the classification by factorization." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "ea2ebdd5", - "metadata": {}, - "outputs": [], - "source": [ - "data[\"transaction_category\"] = data[\"transaction_category\"].replace(factorize_key)" - ] - }, - { - "cell_type": "markdown", - "id": "ce6d007f-4be8-47be-a993-6c0e6a217d86", - "metadata": { - "tags": [] - }, - "source": [ - "### 3. Create feature store \n", - "\n", - "To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). " - ] - }, - { - "cell_type": "markdown", - "id": "7fa840f3-e226-4e6a-9159-748b5dd77f8d", - "metadata": {}, - "source": [ - "#### feature-group-payment-classification" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "3c621044-681a-4e1a-9968-f637ed992539", - "metadata": {}, - "outputs": [], - "source": [ - "def add_grouped_features(df):\n", - " feature_store_data = pd.DataFrame()\n", - " feature_store_data[\"mean_amount\"] = df.groupby([\"transaction_category\"]).mean()[\"amount\"]\n", - " feature_store_data[\"count\"] = df.groupby([\"transaction_category\"]).count()[\"amount\"]\n", - " feature_store_data[\"identifier\"] = feature_store_data.index\n", - " feature_store_data[\"EventTime\"] = time.time()\n", - " \n", - " \n", - " \n", - " additional_features = pd.pivot_table(\n", - " feature_store_data, values=[\"mean_amount\"], index=[\"identifier\"]\n", - " ).T.add_prefix(\"dist_\")\n", - " additional_features_columns = list(additional_features.columns)\n", - " df2 = df.copy()\n", - " df2 = pd.concat([df2, pd.DataFrame(columns=additional_features_columns, dtype=object)])\n", - " df2[additional_features_columns] = additional_features.values[0]\n", - " for col in additional_features_columns:\n", - " df2[col] = abs(df2[col] - df2[\"amount\"]) \n", - " df2['transaction_id']= 
df2.reset_index().index \n", - " return df2" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "c71af4a9-f2d8-40ca-b0bf-3ef67c5b69d9", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "add_grouped_features\n", - "\n", - "add_grouped_features\n", - "\n", - "\n", - "\n", - "_start->add_grouped_features\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "parquet/parquet\n", - "\n", - "\n", - "parquet\n", - "\n", - "\n", - "\n", - "add_grouped_features->parquet/parquet\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "nosql/nosql\n", - "\n", - "\n", - "nosql\n", - "\n", - "\n", - "\n", - "add_grouped_features->nosql/nosql\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fstore\n", - "from mlrun.datastore.targets import ParquetTarget\n", - "\n", - "# creating feature set\n", - "extended_transactions_set = fstore.FeatureSet(\"transactions\",\n", - " entities=[fstore.Entity(\"transaction_id\")],\n", - " engine=\"pandas\",\n", - " description=\"transactions feature set\")\n", - "\n", - "# setting up the graph\n", - "extended_transactions_set.graph \\\n", - " .to(name='add_grouped_features', handler='add_grouped_features')\n", - " # Add aggregations for 2, 12, and 24 hour time windows\n", - " \n", - " \n", - "\n", - "\n", - "extended_transactions_set.set_targets()\n", - "\n", - "extended_transactions_set.plot(rankdir=\"LR\", with_targets=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "2085e0a9-56e1-4641-a4a6-64e2124d9c15", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-01-25 14:11:30,483 [warning] Overriding type of entity 'transaction_id' from 
'str' to 'int'. This may result in errors or unusable data.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
transaction_categoryreceiver_idsender_idamountyearmonthdayhourminutesecond...dist_18dist_2dist_3dist_4dist_5dist_6dist_7dist_8dist_9transaction_id
004.518552e+154.333582e+15833.262021.03.010.019.057.042.0...627.80284917.893495732.342497801.755964713.663595740.010607782.1875535191.287484718.4804420
104.518552e+154.642413e+15596.632021.02.011.017.053.032.0...391.172849254.523495495.712497565.125964477.033595503.380607545.5575535427.917484481.8504421
204.274544e+154.952666e+15176.762021.02.021.018.029.032.0...28.697151674.39349575.842497145.25596457.16359583.510607125.6875535847.78748461.9804422
304.518552e+154.457299e+15879.782021.04.09.016.014.019.0...674.32284928.626505778.862497848.275964760.183595786.530607828.7075535144.767484765.0004423
404.601853e+154.578126e+15742.252021.04.04.015.050.016.0...536.792849108.903495641.332497710.745964622.653595649.000607691.1775535282.297484627.4704424
..................................................................
99992184.405008e+154.583356e+15205.432021.04.020.012.023.053.0...0.027151645.723495104.512497173.92596485.833595112.180607154.3575535819.11748490.65044299992
99993184.300417e+154.949241e+15151.492021.03.024.019.030.018.0...53.967151699.66349550.572497119.98596431.89359558.240607100.4175535873.05748436.71044299993
99994184.405008e+154.996896e+15188.282021.03.08.019.051.010.0...17.177151662.87349587.362497156.77596468.68359595.030607137.2075535836.26748473.50044299994
99995184.262047e+154.017367e+15204.262021.02.014.023.025.07.0...1.197151646.893495103.342497172.75596484.663595111.010607153.1875535820.28748489.48044299995
99996184.627517e+154.250421e+15207.922021.04.014.00.042.00.0...2.462849643.233495107.002497176.41596488.323595114.670607156.8475535816.62748493.14044299996
\n", - "

99997 rows × 30 columns

\n", - "
" - ], - "text/plain": [ - " transaction_category receiver_id sender_id amount year month \\\n", - "0 0 4.518552e+15 4.333582e+15 833.26 2021.0 3.0 \n", - "1 0 4.518552e+15 4.642413e+15 596.63 2021.0 2.0 \n", - "2 0 4.274544e+15 4.952666e+15 176.76 2021.0 2.0 \n", - "3 0 4.518552e+15 4.457299e+15 879.78 2021.0 4.0 \n", - "4 0 4.601853e+15 4.578126e+15 742.25 2021.0 4.0 \n", - "... ... ... ... ... ... ... \n", - "99992 18 4.405008e+15 4.583356e+15 205.43 2021.0 4.0 \n", - "99993 18 4.300417e+15 4.949241e+15 151.49 2021.0 3.0 \n", - "99994 18 4.405008e+15 4.996896e+15 188.28 2021.0 3.0 \n", - "99995 18 4.262047e+15 4.017367e+15 204.26 2021.0 2.0 \n", - "99996 18 4.627517e+15 4.250421e+15 207.92 2021.0 4.0 \n", - "\n", - " day hour minute second ... dist_18 dist_2 dist_3 \\\n", - "0 10.0 19.0 57.0 42.0 ... 627.802849 17.893495 732.342497 \n", - "1 11.0 17.0 53.0 32.0 ... 391.172849 254.523495 495.712497 \n", - "2 21.0 18.0 29.0 32.0 ... 28.697151 674.393495 75.842497 \n", - "3 9.0 16.0 14.0 19.0 ... 674.322849 28.626505 778.862497 \n", - "4 4.0 15.0 50.0 16.0 ... 536.792849 108.903495 641.332497 \n", - "... ... ... ... ... ... ... ... ... \n", - "99992 20.0 12.0 23.0 53.0 ... 0.027151 645.723495 104.512497 \n", - "99993 24.0 19.0 30.0 18.0 ... 53.967151 699.663495 50.572497 \n", - "99994 8.0 19.0 51.0 10.0 ... 17.177151 662.873495 87.362497 \n", - "99995 14.0 23.0 25.0 7.0 ... 1.197151 646.893495 103.342497 \n", - "99996 14.0 0.0 42.0 0.0 ... 2.462849 643.233495 107.002497 \n", - "\n", - " dist_4 dist_5 dist_6 dist_7 dist_8 \\\n", - "0 801.755964 713.663595 740.010607 782.187553 5191.287484 \n", - "1 565.125964 477.033595 503.380607 545.557553 5427.917484 \n", - "2 145.255964 57.163595 83.510607 125.687553 5847.787484 \n", - "3 848.275964 760.183595 786.530607 828.707553 5144.767484 \n", - "4 710.745964 622.653595 649.000607 691.177553 5282.297484 \n", - "... ... ... ... ... ... 
\n", - "99992 173.925964 85.833595 112.180607 154.357553 5819.117484 \n", - "99993 119.985964 31.893595 58.240607 100.417553 5873.057484 \n", - "99994 156.775964 68.683595 95.030607 137.207553 5836.267484 \n", - "99995 172.755964 84.663595 111.010607 153.187553 5820.287484 \n", - "99996 176.415964 88.323595 114.670607 156.847553 5816.627484 \n", - "\n", - " dist_9 transaction_id \n", - "0 718.480442 0 \n", - "1 481.850442 1 \n", - "2 61.980442 2 \n", - "3 765.000442 3 \n", - "4 627.470442 4 \n", - "... ... ... \n", - "99992 90.650442 99992 \n", - "99993 36.710442 99993 \n", - "99994 73.500442 99994 \n", - "99995 89.480442 99995 \n", - "99996 93.140442 99996 \n", - "\n", - "[99997 rows x 30 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import mlrun.feature_store as fstore\n", - "data = extended_transactions_set.ingest(data, overwrite=True)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "id": "e2f6395f", - "metadata": {}, - "source": [ - "And display them after getting them from the feature store" - ] - }, - { - "cell_type": "markdown", - "id": "cf148985", - "metadata": {}, - "source": [ - "We use the feature store to calculate the distance between the average of every category and the current amount" - ] - }, - { - "cell_type": "markdown", - "id": "289eeca6", - "metadata": {}, - "source": [ - "### 4. Create model \n", - "In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. 
The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).\n", - "\n", - "\n", - "\n", - "Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "bb4bdd8d", - "metadata": {}, - "outputs": [], - "source": [ - "# Randomly sort the data then split out first 70%, second 20%, and last 10%\n", - "train_data, validation_data, test_data = np.split(\n", - " data.sample(frac=1, random_state=42), [int(0.7 * len(data)), int(0.9 * len(data))]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f81f65b9", - "metadata": {}, - "source": [ - "We save these sets to a file." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "f849a7a9", - "metadata": {}, - "outputs": [], - "source": [ - "train_data.to_csv(\"train.csv\", index=False, header=False)\n", - "validation_data.to_csv(\"validation.csv\", index=False, header=False)\n", - "test_data.to_csv(\"test.csv\", index=False, header=True)" - ] - }, - { - "cell_type": "markdown", - "id": "de669936", - "metadata": {}, - "source": [ - "And upload these files to our s3 bucket" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "e1ca2543", - "metadata": {}, - "outputs": [], - "source": [ - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"train/train.csv\")\n", - ").upload_file(\"train.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"validation/validation.csv\")\n", - ").upload_file(\"validation.csv\")\n", - "boto3.Session().resource(\"s3\").Bucket(s3_bucket).Object(\n", - " os.path.join(bucket_prefix, \"test/test.csv\")\n", - 
").upload_file(\"test.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "22de532f", - "metadata": {}, - "source": [ - "Get the XGBoost sagemaker image" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "a41b6a7d", - "metadata": {}, - "outputs": [], - "source": [ - "container = sagemaker.image_uris.retrieve(region=region, framework=\"xgboost\", version=\"1.2-2\")" - ] - }, - { - "cell_type": "markdown", - "id": "66cae2a9", - "metadata": {}, - "source": [ - "Transform our data to a sagemaker input for training" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "e51c917a", - "metadata": {}, - "outputs": [], - "source": [ - "s3_input_train = sagemaker.inputs.TrainingInput(\n", - " s3_data=\"s3://{}/{}/train\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")\n", - "s3_input_validation = sagemaker.inputs.TrainingInput(\n", - " s3_data=\"s3://{}/{}/validation/\".format(s3_bucket, bucket_prefix), content_type=\"csv\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6f2985d8", - "metadata": {}, - "source": [ - "We define the XGBoost model" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "92c1fe8c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb = sagemaker.estimator.Estimator(\n", - " container,\n", - " role,\n", - " instance_count=1,\n", - " instance_type=\"ml.m4.xlarge\",\n", - " output_path=\"s3://{}/{}/output\".format(s3_bucket, bucket_prefix),\n", - " sagemaker_session=sagemaker_session,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ecafdfe8", - "metadata": {}, - "source": [ - "Set the parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "582adc6c", - "metadata": {}, - "outputs": [], - "source": [ - "xgb.set_hyperparameters(\n", - " max_depth=5,\n", - " eta=0.2,\n", - " gamma=4,\n", - " min_child_weight=6,\n", - " subsample=0.8,\n", - " objective=\"multi:softprob\",\n", - " num_class=19,\n", - " verbosity=0,\n", - " 
num_round=100,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b36463dd", - "metadata": {}, - "source": [ - "And train the model" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "c24e06fc", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-25-14-12-01-149\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-01-25 14:12:01 Starting - Starting the training job...\n", - "2024-01-25 14:12:18 Starting - Preparing the instances for training.........\n", - "2024-01-25 14:13:58 Downloading - Downloading input data......\n", - "2024-01-25 14:14:34 Downloading - Downloading the training image...\n", - "2024-01-25 14:15:29 Training - Training image download completed. Training in progress...\u001b[34m[2024-01-25 14:15:41.041 ip-10-2-106-129.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Single node training.\u001b[0m\n", - 
"\u001b[34m[2024-01-25:14:15:41:INFO] Train matrix has 69997 rows and 29 columns\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Validation matrix has 20000 rows\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.342 ip-10-2-106-129.ec2.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.343 ip-10-2-106-129.ec2.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler.\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:41.344 ip-10-2-106-129.ec2.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.\u001b[0m\n", - "\u001b[34m[2024-01-25:14:15:41:INFO] Debug hook created from config\u001b[0m\n", - "\u001b[34m[0]#011train-merror:0.00047#011validation-merror:0.00050\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:42.380 ip-10-2-106-129.ec2.internal:7 INFO hook.py:423] Monitoring the collections: metrics\u001b[0m\n", - "\u001b[34m[2024-01-25 14:15:42.383 ip-10-2-106-129.ec2.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7\u001b[0m\n", - "\u001b[34m[1]#011train-merror:0.00023#011validation-merror:0.00040\u001b[0m\n", - "\u001b[34m[2]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[3]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[4]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[5]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[6]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - 
"\u001b[34m[7]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[8]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[9]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[10]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[11]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[12]#011train-merror:0.00001#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[13]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[14]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[15]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[16]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[17]#011train-merror:0.00001#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[18]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[19]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[20]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[21]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[22]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[23]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[24]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[25]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[26]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[27]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[28]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[29]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - 
"\u001b[34m[30]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[31]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[32]#011train-merror:0.00000#011validation-merror:0.00015\u001b[0m\n", - "\u001b[34m[33]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[34]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[35]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[36]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[37]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[38]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[39]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[40]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[41]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[42]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[43]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[44]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[45]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[46]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[47]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[48]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[49]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[50]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[51]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[52]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - 
"\u001b[34m[53]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[54]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[55]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[56]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[57]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[58]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[59]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[60]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[61]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[62]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[63]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[64]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[65]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[66]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[67]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[68]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[69]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[70]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[71]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[72]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[73]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[74]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[75]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - 
"\u001b[34m[76]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[77]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[78]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[79]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[80]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[81]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[82]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[83]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[84]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[85]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[86]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[87]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[88]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[89]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\n", - "2024-01-25 14:17:00 Uploading - Uploading generated training model\u001b[34m[90]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[91]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[92]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[93]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[94]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[95]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[96]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[97]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - 
"\u001b[34m[98]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\u001b[34m[99]#011train-merror:0.00000#011validation-merror:0.00010\u001b[0m\n", - "\n", - "2024-01-25 14:17:16 Completed - Training job completed\n", - "Training seconds: 198\n", - "Billable seconds: 198\n" - ] - } - ], - "source": [ - "xgb.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})" - ] - }, - { - "cell_type": "markdown", - "id": "8b716cd7", - "metadata": {}, - "source": [ - "### 5. Using the endpoint \n", - "\n", - "Deploy the model to an endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "78444d49-4ad3-49e4-a579-19b173facb26", - "metadata": {}, - "outputs": [], - "source": [ - "serving_function = project.get_function(\"serving\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "bd061d8f-82e5-4ed2-b3ed-d994fadaceaa", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "mlrun-flow\n", - "\n", - "\n", - "\n", - "_start\n", - "\n", - "start\n", - "\n", - "\n", - "\n", - "xgboost-model\n", - "\n", - "xgboost-model\n", - "\n", - "\n", - "\n", - "_start->xgboost-model\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "postprocess\n", - "\n", - "postprocess\n", - "\n", - "\n", - "\n", - "xgboost-model->postprocess\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Set the topology and get the graph object:\n", - "graph = serving_function.set_topology(\"flow\", engine=\"async\")\n", - "\n", - "# Add the steps:\n", - "graph.to(\"XGBModelServer\",\n", - " name=\"xgboost-model\",\n", - " model_path=xgb.model_data) \\\n", - " .to(handler=\"postprocess\", name=\"postprocess\").respond()\n", - "\n", - "# Plot to graph:\n", - "serving_function.plot(rankdir='LR')" - ] - }, - { - "cell_type": "code", - "execution_count": 28, 
- "id": "ceae49b8-98d1-4c00-8bde-597ee80cf6e3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-01-25 14:17:46,696 [info] Starting remote function deploy\n", - "2024-01-25 14:17:46 (info) Deploying function\n", - "2024-01-25 14:17:46 (info) Building\n", - "2024-01-25 14:17:47 (info) Staging files and preparing base images\n", - "2024-01-25 14:17:47 (info) Building processor image\n", - "2024-01-25 14:19:32 (info) Build complete\n", - "2024-01-25 14:19:40 (info) Function deploy complete\n", - "> 2024-01-25 14:19:48,105 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/']}\n" - ] - }, - { - "data": { - "text/plain": [ - "DeployStatus(state=ready, outputs={'endpoint': 'http://sagemaker-yoni-serving-sagemaker-yoni.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/', 'name': 'sagemaker-yoni-serving'})" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "project.deploy_function(\"serving\")" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c858e3e9-9e43-4148-8015-6047565db456", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'test_data' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m samples \u001b[38;5;241m=\u001b[39m 
\u001b[43mtest_data\u001b[49m\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtransaction_category\u001b[39m\u001b[38;5;124m'\u001b[39m,axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)[:\u001b[38;5;241m500\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues\u001b[38;5;241m.\u001b[39mtolist()\n", - "\u001b[0;31mNameError\u001b[0m: name 'test_data' is not defined" - ] - } - ], - "source": [ - "samples = test_data.drop('transaction_category',axis=1)[:500].values.tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "de741da6-8ff6-4f60-bae4-3c1d11df87c4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-01-25 14:19:48,167 [info] invoking function: {'method': 'POST', 'path': 'http://nuclio-sagemaker-yoni-serving.default-tenant.svc.cluster.local:8080/predict'}\n" - ] - } - ], - "source": [ - "response = serving_function.invoke(path='/predict', body={\"inputs\": samples})" - ] - }, - { - "cell_type": "markdown", - "id": "712f4d35", - "metadata": {}, - "source": [ - "### 6. 
Evaluate performance \n", - "\n", - "Run the model on our test data" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "2e863ea7-5804-4637-b677-390c305cabfe", - "metadata": {}, - "outputs": [], - "source": [ - "s3_data = \"s3://{}/{}/test/test.csv\".format(s3_bucket, bucket_prefix)" - ] - }, - { - "cell_type": "markdown", - "id": "507de272-df4e-4fbe-be2e-cd99fae1b63a", - "metadata": {}, - "source": [ - "Add the evaluation function to our project" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "ca4f7e49", - "metadata": {}, - "outputs": [], - "source": [ - "evaluate_function = project.get_function(\"evaluate\")" - ] - }, - { - "cell_type": "markdown", - "id": "9ba13872-7f0e-4033-96ce-ad8cde950442", - "metadata": {}, - "source": [ - "Run the evaluation job" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "b6eab7af-e967-4e22-9817-18cc4bf2db0c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-01-25 14:19:48,410 [info] Storing function: {'name': 'evaluate-evaluate', 'uid': 'cac9cd3c55ba40d58fbe1156d4861e79', 'db': 'http://mlrun-api:8080'}\n", - "> 2024-01-25 14:19:48,708 [info] Job is running in the background, pod: evaluate-evaluate-5rrtk\n", - "[14:19:52] WARNING: /workspace/src/common/error_msg.h:80: If you are loading a serialized model (like pickle in Python, RDS in R) or\n", - "configuration generated by an older version of XGBoost, please export the model by calling\n", - "`Booster.save_model` from that version first, then load it back in current version. 
See:\n", - "\n", - " https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html\n", - "\n", - "for more details about differences between saving model and serializing.\n", - "\n", - "> 2024-01-25 14:19:53,802 [info] To track results use the CLI: {'info_cmd': 'mlrun get run cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni', 'logs_cmd': 'mlrun logs cac9cd3c55ba40d58fbe1156d4861e79 -p sagemaker-yoni'}\n", - "> 2024-01-25 14:19:53,802 [info] Or click for UI: {'ui_url': 'https://dashboard.default-tenant.app.app-lab-2-b688.iguazio-cd2.com/mlprojects/sagemaker-yoni/jobs/monitor/cac9cd3c55ba40d58fbe1156d4861e79/overview'}\n", - "> 2024-01-25 14:19:53,803 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
sagemaker-yoni0Jan 25 14:19:51completedevaluate-evaluate
v3io_user=yoni
kind=job
owner=yoni
mlrun/client_version=1.6.0-rc21
mlrun/client_python_version=3.9.16
host=evaluate-evaluate-5rrtk
model_path=s3://sagemaker-us-east-1-934638699319/payment-classification/output/sagemaker-xgboost-2024-01-25-14-12-01-149/output/model.tar.gz
model_name=xgboost-model
test_set=s3://sagemaker-us-east-1-934638699319/payment-classification/test/test.csv
label_column=transaction_category
factorize_key={'Uncategorized': '0', 'Entertainment': '1', 'Education': '2', 'Shopping': '3', 'Personal Care': '4', 'Health and Fitness': '5', 'Food and Dining': '6', 'Gifts and Donations': '7', 'Investments': '8', 'Bills and Utilities': '9', 'Auto and Transport': '10', 'Travel': '11', 'Fees and Charges': '12', 'Business Services': '13', 'Personal Services': '14', 'Taxes': '15', 'Gambling': '16', 'Home': '17', 'Pension and insurances': '18'}
classification_report
\n", - "
\n", - "
\n", - "
\n", - " Title\n", - " ×\n", - "
\n", - " \n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - " > to track results use the .show() or .logs() methods or click here to open in UI" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2024-01-25 14:19:59,831 [info] Run execution finished: {'status': 'completed', 'name': 'evaluate-evaluate'}\n" - ] - } - ], - "source": [ - "evaluate_run = evaluate_function.run(\n", - " handler=\"evaluate\",\n", - " params={\n", - " \"model_path\": xgb.model_data,\n", - " \"model_name\": \"xgboost-model\",\n", - " \"test_set\": s3_data,\n", - " \"label_column\": \"transaction_category\",\n", - " \"factorize_key\": factorize_key,\n", - " },\n", - " returns=[\"classification_report: dataset\"])" - ] - }, - { - "cell_type": "markdown", - "id": "ffc4326e-3085-47e1-b1f6-97d5eceba893", - "metadata": {}, - "source": [ - "See the evaluation result" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "3a9c30bd-a3bf-49f1-b57e-1490f3da00f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
precisionrecallf1-scoresupport
Uncategorized1.0000001.0000001.00000051.0000
Entertainment1.0000001.0000001.0000001486.0000
Education1.0000001.0000001.00000080.0000
Shopping1.0000001.0000001.0000003441.0000
Personal Care1.0000001.0000001.000000132.0000
Health and Fitness1.0000001.0000001.000000443.0000
Food and Dining1.0000001.0000001.000000918.0000
Gifts and Donations1.0000001.0000001.000000275.0000
Investments1.0000001.0000001.00000088.0000
Bills and Utilities1.0000001.0000001.000000332.0000
Auto and Transport1.0000001.0000001.0000001967.0000
Travel1.0000001.0000001.000000120.0000
Fees and Charges1.0000001.0000001.000000106.0000
Business Services1.0000001.0000001.000000146.0000
Personal Services1.0000001.0000001.00000075.0000
Taxes1.0000000.9787230.98924747.0000
Gambling0.9375001.0000000.96774215.0000
Home1.0000001.0000001.000000168.0000
Pension and insurances1.0000001.0000001.000000110.0000
accuracy0.9999000.9999000.9999000.9999
macro avg0.9967110.9988800.99773610000.0000
weighted avg0.9999060.9999000.99990110000.0000
\n", - "
" - ], - "text/plain": [ - " precision recall f1-score support\n", - "Uncategorized 1.000000 1.000000 1.000000 51.0000\n", - "Entertainment 1.000000 1.000000 1.000000 1486.0000\n", - "Education 1.000000 1.000000 1.000000 80.0000\n", - "Shopping 1.000000 1.000000 1.000000 3441.0000\n", - "Personal Care 1.000000 1.000000 1.000000 132.0000\n", - "Health and Fitness 1.000000 1.000000 1.000000 443.0000\n", - "Food and Dining 1.000000 1.000000 1.000000 918.0000\n", - "Gifts and Donations 1.000000 1.000000 1.000000 275.0000\n", - "Investments 1.000000 1.000000 1.000000 88.0000\n", - "Bills and Utilities 1.000000 1.000000 1.000000 332.0000\n", - "Auto and Transport 1.000000 1.000000 1.000000 1967.0000\n", - "Travel 1.000000 1.000000 1.000000 120.0000\n", - "Fees and Charges 1.000000 1.000000 1.000000 106.0000\n", - "Business Services 1.000000 1.000000 1.000000 146.0000\n", - "Personal Services 1.000000 1.000000 1.000000 75.0000\n", - "Taxes 1.000000 0.978723 0.989247 47.0000\n", - "Gambling 0.937500 1.000000 0.967742 15.0000\n", - "Home 1.000000 1.000000 1.000000 168.0000\n", - "Pension and insurances 1.000000 1.000000 1.000000 110.0000\n", - "accuracy 0.999900 0.999900 0.999900 0.9999\n", - "macro avg 0.996711 0.998880 0.997736 10000.0000\n", - "weighted avg 0.999906 0.999900 0.999901 10000.0000" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluate_run.artifact(\"classification_report\").as_df()" - ] - }, - { - "cell_type": "markdown", - "id": "98d0b67e", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "You should see results similar to this:\n", - "\n", - "```\n", - " precision recall f1-score support\n", - "\n", - " Uncategorized 1.00 0.92 0.96 51\n", - " Entertainment 0.81 0.89 0.85 1486\n", - " Education 1.00 0.94 0.97 80\n", - " Shopping 0.86 0.94 0.90 3441\n", - " Personal Care 1.00 0.98 0.99 132\n", - " Health and Fitness 0.99 0.89 0.94 
443\n", - " Food and Dining 0.99 0.82 0.90 918\n", - " Gifts and Donations 1.00 0.95 0.97 275\n", - " Investments 0.99 0.97 0.98 88\n", - " Bills and Utilities 1.00 0.99 1.00 332\n", - " Auto and Transport 0.94 0.84 0.88 1967\n", - " Travel 0.96 0.84 0.90 120\n", - " Fees and Charges 1.00 0.94 0.97 106\n", - " Business Services 1.00 0.99 1.00 146\n", - " Personal Services 1.00 0.96 0.98 75\n", - " Taxes 0.98 0.94 0.96 47\n", - " Gambling 1.00 1.00 1.00 15\n", - " Home 0.98 0.89 0.93 168\n", - "Pension and insurances 0.99 1.00 1.00 110\n", - "\n", - " accuracy 0.90 10000\n", - " macro avg 0.97 0.93 0.95 10000\n", - " weighted avg 0.91 0.90 0.90 10000\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "49fdc82d", - "metadata": {}, - "source": [ - "### 7. Clean up \n", - "\n", - "Remove the feature group and endpoint to clean up" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "f79b1164", - "metadata": {}, - "outputs": [], - "source": [ - "#feature_group.delete()\n", - "#xgb_predictor.delete_endpoint(delete_endpoint_config=True)" - ] - }, - { - "cell_type": "markdown", - "id": "e04b6fa6", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Notebook CI Test Results\n", - "\n", - "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", - "\n", - "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-east-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-west-3 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-northeast-2 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n", - "\n", - "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/use-cases|financial_payment_classification|financial_payment_classification.ipynb)\n" - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "smdemo", - "language": "python", - "name": "smdemo" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 2f9f0d00bcbe7f096e7594f15a74a36854367dc6 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 12 Feb 2024 14:04:26 +0000 Subject: [PATCH 16/16] delete unused utils --- utils.py | 63 -------------------------------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 utils.py diff --git a/utils.py b/utils.py deleted file mode 100644 index 4ec4cf3..0000000 --- a/utils.py +++ /dev/null @@ -1,63 +0,0 @@ -import pandas as pd -import datetime - -#from datetime import datetime, timedelta - -def update_timestamps(data): - - # Step 3: Get the current time - now = pd.Timestamp(datetime.datetime.now()) - - # Step 4: Calculate the time difference - time_difference = now - data['timestamp'].iloc[-1] - - # Step 5: Adjust all timestamps - data['timestamp'] = data['timestamp'] + time_difference - - # Display the adjusted DataFrame - return data - - -# # Function that updates the timestamps so each transaction category has rows with timestamps from the last 5 
days (2 per day) -# def update_timestamps(data): -# # Get today's date -# today = datetime.today() - -# # Calculate the dates for the last 5 days -# last_5_days = [today - timedelta(days=i) for i in range(4, -1, -1)] # Reverse for chronological order - -# # Extract year, month, and day from each date object -# years = [d.year for d in last_5_days] -# months = [d.month for d in last_5_days] -# days = [d.day for d in last_5_days] - -# hours = [10, 15] - -# # Create a list of timestamps of the last 5 days, 2 timestamps per day. -# times = [] -# for year, month, day in zip(years, months, days): -# for hour in hours: -# times.append(datetime(year, month, day, hour)) - -# # Iterate over each transaction category -# for i in range(len(data["transaction_category"].unique())): -# # Extract all the rows for each category -# category_data = data[data['transaction_category'] == str(i)] - -# # Ensure timestamp is a datetime object -# pd.to_datetime(category_data.timestamp) - -# # Sort DataFrame by timestamp in descending order -# category_data_sorted = category_data.sort_values(by='timestamp', ascending=False) - -# # Select the latest rows and update their timestamp -# latest_rows = category_data_sorted.head(len(times)) -# latest_rows.loc[:, 'timestamp'] = times - -# # Update the initial dataframe to include those updated rows -# data.update(latest_rows) - -# data.sort_values(["transaction_category", "timestamp"], inplace=True) - - -# return data \ No newline at end of file