diff --git "a/\355\225\255\352\263\265\355\216\270\354\247\200\354\227\260\354\230\210\354\270\241.ipynb" "b/\355\225\255\352\263\265\355\216\270\354\247\200\354\227\260\354\230\210\354\270\241.ipynb" new file mode 100644 index 0000000..6f8901e --- /dev/null +++ "b/\355\225\255\352\263\265\355\216\270\354\247\200\354\227\260\354\230\210\354\270\241.ipynb" @@ -0,0 +1,757 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyMBkechRUQoGPamLiP49sv5", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "RQowCrZVSrRM" + }, + "outputs": [], + "source": [ + "import random\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import gc\n", + "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold\n", + "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer\n", + "from xgboost import XGBClassifier" + ] + }, + { + "cell_type": "code", + "source": [ + "def seed_everything(seed):\n", + " random.seed(seed)\n", + " os.environ['PYTHONHASHSEED'] = str(seed)\n", + " np.random.seed(seed)\n", + "\n", + "seed_everything(42) # Fixed Seed\n", + "\n", + "def csv_to_parquet(csv_path, save_name):\n", + " df = pd.read_csv(csv_path)\n", + " df.to_parquet(f'./{save_name}.parquet')\n", + " del df\n", + " gc.collect()\n", + " print(save_name, 'Done.')" + ], + "metadata": { + "id": "SzoGNcLvSv8O" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 
drive에 연결합니다.\n", + "from google.colab import drive\n", + "drive.mount('/content/drive', force_remount=True)\n", + "\n", + "# 현재 데이터가 있는 공간으로 작업 경로를 변경해줍니다.\n", + "%cd \"/content/drive/MyDrive/dacon/open_1\"" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TYV5jX2yYkx1", + "outputId": "cff46d69-e744-49ca-d23e-76db3459a8ab" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "/content/drive/MyDrive/dacon/open_1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "csv_to_parquet('train.csv', 'train')\n", + "csv_to_parquet('test.csv', 'test')\n", + "\n", + "train = pd.read_parquet('./train.parquet')\n", + "test = pd.read_parquet('./test.parquet')\n", + "sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uC9I13rAXf9L", + "outputId": "a5cb84f5-6690-4d1e-cdcd-ad07bdea267f" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "train Done.\n", + "test Done.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다\n", + "NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']\n", + "\n", + "for col in NaN_col:\n", + " mode = train[col].mode()[0]\n", + " train[col] = train[col].fillna(mode)\n", + "\n", + " if col in test.columns:\n", + " test[col] = test[col].fillna(mode)\n", + "print('Done.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "P_OEaI3qY6fU", + "outputId": "291b3be3-69fd-4330-a1fb-37f25574bbea" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Done.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + 
"# cols = train.select_dtypes(include=['object'])\n", + "# le = LabelEncoder()\n", + "\n", + "# for col in cols:\n", + "# train[col] = le.fit_transform(train[col])\n", + "# test[col] = le.transform(test[col])" + ], + "metadata": { + "id": "b02EcTEfbeAX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#질적 변수들을 수치화합니다\n", + "qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']\n", + "\n", + "for i in qual_col:\n", + " le = LabelEncoder()\n", + " le=le.fit(train[i])\n", + " train[i]=le.transform(train[i])\n", + "\n", + " for label in np.unique(test[i]):\n", + " if label not in le.classes_:\n", + " le.classes_ = np.append(le.classes_, label)\n", + " test[i]=le.transform(test[i])\n", + "print('Done.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "npCizVkPY8z0", + "outputId": "4ef74ffc-c836-447f-d3a8-84be8123d744" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Done.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#레이블이 없는 데이터들을 제거합니다\n", + "train = train.dropna()" + ], + "metadata": { + "id": "U_FNfAPicEDy" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "column_number = {}\n", + "for i, column in enumerate(sample_submission.columns):\n", + " column_number[column] = i\n", + "\n", + "def to_number(x, dic):\n", + " return dic[x]\n", + "\n", + "train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))\n", + "print('Done.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NwrUzHKtcEif", + "outputId": "ca472243-f309-43e3-d72c-62ea65d66f46" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Done.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": 
[ + "train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])\n", + "train_y = train['Delay_num']\n", + "test_x = test.drop(columns=['ID'])" + ], + "metadata": { + "id": "BArQV8xycHhg" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)" + ], + "metadata": { + "id": "SAaBR2ledpbW" + }, + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "scaler = StandardScaler()\n", + "train_x = scaler.fit_transform(train_x)\n", + "val_x = scaler.transform(val_x)\n", + "test_x = scaler.transform(test_x)" + ], + "metadata": { + "id": "kPz2yfVedxwc" + }, + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, log_loss\n", + "\n", + "dt = DecisionTreeClassifier()\n", + "dt.fit(train_x, train_y)\n", + "pred_1 = dt.predict(val_x)\n", + "\n", + "accuracy = accuracy_score(val_y, pred_1)\n", + "f1 = f1_score(val_y, pred_1, average='weighted')\n", + "precision = precision_score(val_y, pred_1, average='weighted')\n", + "recall = recall_score(val_y, pred_1, average='weighted')\n", + "\n", + "print(f'Accuracy: {accuracy}')\n", + "print(f'F1 Score: {f1}')\n", + "print(f'Precision: {precision}')\n", + "print(f'Recall: {recall}')\n", + "\n", + "cm1 = confusion_matrix(val_y, pred_1)\n", + "print(cm1)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QTdI1KxxcK0k", + "outputId": "24c24fab-ef94-413e-e570-a847fe8ac3ae" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.7101625458324347\n", + "F1 Score: 0.7157154247660936\n", + "Precision: 0.7216520312641815\n", + "Recall: 0.7101625458324347\n", + "[[34023 7869]\n", + " [ 
6913 2196]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "rf = RandomForestClassifier(random_state=42)\n", + "\n", + "rf.fit(train_x, train_y)\n", + "pred_2 = rf.predict(val_x)\n", + "\n", + "accuracy = accuracy_score(val_y, pred_2)\n", + "f1 = f1_score(val_y, pred_2, average='weighted')\n", + "precision = precision_score(val_y, pred_2, average='weighted')\n", + "recall = recall_score(val_y, pred_2, average='weighted')\n", + "\n", + "print(f'Accuracy: {accuracy}')\n", + "print(f'F1 Score: {f1}')\n", + "print(f'Precision: {precision}')\n", + "print(f'Recall: {recall}')\n", + "\n", + "cm2 = confusion_matrix(val_y, pred_2)\n", + "print(cm2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FFFlTzfyep-e", + "outputId": "42cec3e3-24a2-475b-c8f2-c66567e624de" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.8181996431442521\n", + "F1 Score: 0.7489386921906548\n", + "Precision: 0.7463246276782165\n", + "Recall: 0.8181996431442521\n", + "[[41449 443]\n", + " [ 8829 280]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from xgboost import XGBClassifier\n", + "\n", + "xgb = XGBClassifier(random_state=42)\n", + "\n", + "xgb.fit(train_x, train_y)\n", + "pred_3 = xgb.predict(val_x)\n", + "\n", + "accuracy = accuracy_score(val_y, pred_3)\n", + "f1 = f1_score(val_y, pred_3, average='weighted')\n", + "precision = precision_score(val_y, pred_3, average='weighted')\n", + "recall = recall_score(val_y, pred_3, average='weighted')\n", + "\n", + "print(f'Accuracy: {accuracy}')\n", + "print(f'F1 Score: {f1}')\n", + "print(f'Precision: {precision}')\n", + "print(f'Recall: {recall}')\n", + "\n", + "cm3 = confusion_matrix(val_y, pred_3)\n", + "print(cm3)" + ], + "metadata": { + "id": "4jN9KiZSfaA6", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": 
"f0b83813-4522-41e6-b5b2-dda8e5cd7169" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.8212976216152624\n", + "F1 Score: 0.7490495880677704\n", + "Precision: 0.7654504972612673\n", + "Recall: 0.8212976216152624\n", + "[[41655 237]\n", + " [ 8877 232]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from lightgbm import LGBMClassifier\n", + "\n", + "lgbm = LGBMClassifier()\n", + "lgbm.fit(train_x, train_y)\n", + "pred_4 = lgbm.predict(val_x)\n", + "\n", + "accuracy = accuracy_score(val_y, pred_4)\n", + "f1 = f1_score(val_y, pred_4, average='weighted')\n", + "precision = precision_score(val_y, pred_4, average='weighted')\n", + "recall = recall_score(val_y, pred_4, average='weighted')\n", + "\n", + "print(f'Accuracy: {accuracy}')\n", + "print(f'F1 Score: {f1}')\n", + "print(f'Precision: {precision}')\n", + "print(f'Recall: {recall}')\n", + "\n", + "cm4 = confusion_matrix(val_y, pred_4)\n", + "print(cm4)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UviJ7wHzR-ul", + "outputId": "78723c7d-def0-4a8e-d0a8-6e102c00a467" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[LightGBM] [Info] Number of positive: 35891, number of negative: 168109\n", + "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074233 seconds.\n", + "You can set `force_col_wise=true` to remove the overhead.\n", + "[LightGBM] [Info] Total Bins 2098\n", + "[LightGBM] [Info] Number of data points in the train set: 204000, number of used features: 15\n", + "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175936 -> initscore=-1.544126\n", + "[LightGBM] [Info] Start training from score -1.544126\n", + "Accuracy: 0.8217682006235172\n", + "F1 Score: 0.7429650884383274\n", + "Precision: 0.7845486323828018\n", + "Recall: 0.8217682006235172\n", + "[[41859 33]\n", + " [ 9057 52]]\n" 
+ ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import log_loss\n", + "\n", + "logloss_1 = log_loss(val_y, pred_1)\n", + "logloss_2 = log_loss(val_y, pred_2)\n", + "logloss_3 = log_loss(val_y, pred_3)\n", + "logloss_4 = log_loss(val_y, pred_4)\n", + "\n", + "f1_1 = f1_score(val_y, pred_1, average='weighted')\n", + "f1_2 = f1_score(val_y, pred_2, average='weighted')\n", + "f1_3 = f1_score(val_y, pred_3, average='weighted')\n", + "f1_4 = f1_score(val_y, pred_4, average='weighted')\n", + "\n", + "# 결과 출력\n", + "print(f'F1_1 Score: {f1_1}')\n", + "print(f'F1_2 Score: {f1_2}')\n", + "print(f'F1_3 Score: {f1_3}')\n", + "print(f'F1_4 Score: {f1_4}')\n", + "\n", + "print(f\"Log Loss_1: {logloss_1}\")\n", + "print(f\"Log Loss_2: {logloss_2}\")\n", + "print(f\"Log Loss_3: {logloss_3}\")\n", + "print(f\"Log Loss_4: {logloss_4}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uCr3tkmYShCm", + "outputId": "a3524f5d-b682-4fbf-e3af-7c7c32a008a1" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "F1_1 Score: 0.7157154247660936\n", + "F1_2 Score: 0.7489386921906548\n", + "F1_3 Score: 0.7490495880677704\n", + "F1_4 Score: 0.7429650884383274\n", + "Log Loss_1: 10.446800737199853\n", + "Log Loss_2: 6.552749048526386\n", + "Log Loss_3: 6.441086586310341\n", + "Log Loss_4: 6.424125199644614\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" + ], + "metadata": { + "id": "KpF0bhFET-wg" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "param_grid = {\n", + " 'learning_rate': [0.01, 0.1],\n", + " 'max_depth': [3, 5 ,8],\n", + " 'n_estimators': [100, 200],\n", + "}" + ], + "metadata": { + "id": "SInHwUomVelh" + }, + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "grid = GridSearchCV(xgb,\n", + " 
param_grid,\n", + " cv=cv,\n", + " scoring='accuracy',\n", + " n_jobs=-1,\n", + " verbose=1)" + ], + "metadata": { + "id": "bh_xTcuSVfsv" + }, + "execution_count": 28, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "grid.fit(train_x,train_y)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 192 + }, + "id": "QeFgvLUrVobH", + "outputId": "be47ea4b-ee66-4d61-f2a6-e65436647ea4" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n", + " estimator=XGBClassifier(base_score=None, booster=None,\n", + " callbacks=None, colsample_bylevel=None,\n", + " colsample_bynode=None,\n", + " colsample_bytree=None, device=None,\n", + " early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None,\n", + " feature_types=None, gamma=None,\n", + " grow_policy=None, importance_typ...\n", + " max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None,\n", + " max_leaves=None, min_child_weight=None,\n", + " missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None,\n", + " n_jobs=None, num_parallel_tree=None,\n", + " random_state=42, ...),\n", + " n_jobs=-1,\n", + " param_grid={'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 8],\n", + " 'n_estimators': [100, 200]},\n", + " scoring='accuracy', verbose=1)" + ], + "text/html": [ + "
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n", + " estimator=XGBClassifier(base_score=None, booster=None,\n", + " callbacks=None, colsample_bylevel=None,\n", + " colsample_bynode=None,\n", + " colsample_bytree=None, device=None,\n", + " early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None,\n", + " feature_types=None, gamma=None,\n", + " grow_policy=None, importance_typ...\n", + " max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None,\n", + " max_leaves=None, min_child_weight=None,\n", + " missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None,\n", + " n_jobs=None, num_parallel_tree=None,\n", + " random_state=42, ...),\n", + " n_jobs=-1,\n", + " param_grid={'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 8],\n", + " 'n_estimators': [100, 200]},\n", + " scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),\n", + " estimator=XGBClassifier(base_score=None, booster=None,\n", + " callbacks=None, colsample_bylevel=None,\n", + " colsample_bynode=None,\n", + " colsample_bytree=None, device=None,\n", + " early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None,\n", + " feature_types=None, gamma=None,\n", + " grow_policy=None, importance_typ...\n", + " max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None,\n", + " max_leaves=None, min_child_weight=None,\n", + " missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None,\n", + " n_jobs=None, num_parallel_tree=None,\n", + " random_state=42, ...),\n", + " n_jobs=-1,\n", + " param_grid={'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 8],\n", + " 'n_estimators': [100, 200]},\n", + " scoring='accuracy', verbose=1)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...)