# analysis-1.py

# -*- coding: utf-8 -*-
"""Analysis.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1amM_iYzI73lNVlDyvf2yz5VEDlbfDZoT

# Cleaning
"""

# Import libraries and configure pandas to display every column
import pandas as pd
import numpy as np
# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Load the raw data
df = pd.read_excel('/content/up10k.xlsx')

# Drop the ID column and index column
df = df.drop(columns=['id', 'Unnamed: 0'])

# Clean condition column

# Split the "condition" column into exactly three columns using commas as the
# separator.
# n=2 caps the split at three pieces, so a row containing extra commas cannot
# widen the result and break the three-column assignment below; reindex pads
# the result to three columns when every row has fewer than three pieces.
condition_parts = (
    df['condition']
    .str.split(',', n=2, expand=True)
    .reindex(columns=range(3))
)
df[['accidents', 'num_owners', 'personal_use']] = condition_parts

# The combined column is redundant once it has been split
df.drop('condition', axis=1, inplace=True)

# Define a function to extract the numeric portion of a string
def extract_number(s):
    """Return the integer that starts the text *s*, or 0 when there is none.

    The first whitespace-separated token of *s* is parsed with ``int``.

    Args:
        s: The text to parse. Non-string values (for example the ``None`` or
            NaN left behind by a missing field) are accepted and count as 0.

    Returns:
        The parsed integer, or 0 if *s* is not a string, is empty or
        whitespace-only, or does not begin with an integer token.
    """
    # Missing cells reach this function as None/NaN, which have no .split()
    if not isinstance(s, str):
        return 0

    tokens = s.split()
    # An empty or whitespace-only string has no first token to parse
    if not tokens:
        return 0

    try:
        return int(tokens[0])
    except ValueError:
        return 0

# Apply the function to the "accidents" column
df['accidents'] = df['accidents'].apply(extract_number)

# Apply to "num_owners"
df['num_owners'] = df['num_owners'].apply(extract_number)

# Dummy for "personal_use" or "fleet_use": 1 when the text mentions
# "personal" (any letter case), otherwise 0.
# na=False makes a missing field count as 0; without it the missing entry is
# propagated and the integer cast raises.
df['personal_use'] = (
    df['personal_use'].str.contains('personal', case=False, na=False).astype(int)
)

import re
# Remove non-numeric characters from textual "mileage" values using regular
# expressions. Cells that are not strings (e.g. missing or already numeric)
# are passed through untouched because re.sub only accepts strings.
df['mileage'] = df['mileage'].apply(
    lambda x: re.sub('[^0-9]', '', x) if isinstance(x, str) else x
)
# Anything that still cannot be parsed (such as the empty string left when a
# value contained no digits) becomes NaN
df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')


# For model:
# One indicator column per distinct value of "model", named "model_<value>"
model_dummies = pd.get_dummies(df['model'], prefix='model')

# Swap the original "model" column for its indicator columns
df = pd.concat([df.drop(columns='model'), model_dummies], axis=1)


# For location:
# The state code is the second comma-separated piece of "location", with
# surrounding whitespace removed
location_parts = df["location"].str.split(",")
df["State"] = location_parts.str.get(1).str.strip()

# One indicator column per state code, named "loc_<code>"
state_dummies = pd.get_dummies(df["State"], prefix="loc")

# Append the indicator columns; "State" itself is kept in the frame
df = pd.concat([df, state_dummies], axis=1)

# Count of missing values per column (a bare expression: the result is only
# shown when this runs as a notebook cell)
df.isnull().sum()

# Keep complete rows only and renumber them from 0
df = df.dropna().reset_index(drop=True)

# GDP data
# Read the by-state real GDP spreadsheet into its own frame
gdp_path = '/content/statistic_id248053_us-real-gross-domestic-product-2022-by-state.xlsx'
gdp = pd.read_excel(gdp_path)

# Full state name -> two-letter postal code.
# Covers the 50 states plus the District of Columbia, so that a GDP entry for
# D.C. can be converted to the "DC" code as well.
state_abbr = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

# Convert the full state names in the GDP table to abbreviations so they can
# be matched against df["State"]; names not listed in state_abbr stay as-is
gdp['State'] = gdp['State'].replace(state_abbr)

# Inner join on the state code (rows of df without a matching GDP entry are
# dropped), then give the long GDP column the short name "gdp"
gdp_column = 'Real Gross Domestic Product (GDP) of the United States in Q3 2022, by state (in billion chained 2012 U.S. dollars)'
merged_df = pd.merge(df, gdp, how='inner', on='State').rename(
    columns={gdp_column: 'gdp'}
)
df = merged_df

# Drop the original "location" and "State" columns
df.drop(columns=["location", "State"], inplace=True)

# Summary statistics (a bare expression: only shown when run as a notebook cell)
df.describe()

# Persist the cleaned data set without the row index
df.to_csv('full_data.csv', index=False)

"""# Analysis

## Optimizing
"""

# Modules
import inspect
import numpy as np
import pandas as pd

from sklearn import ensemble, metrics, model_selection, preprocessing, tree
from matplotlib import pyplot
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, recall_score

# Separate the "price" column from df and keep it as the prediction target;
# after the pop, df holds only the feature columns
target = df.pop('price')

# Sample split: 75% train / 25% test, shuffled.
# random_state pins the shuffle so that the tuning results and the test-set
# errors printed further down refer to the same split on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df, target, train_size=.75, test_size=.25,
    shuffle=True, random_state=42)

# ============================== tree tuning ==============================:
# Candidate settings to try for the decision tree (4 x 3 x 3 combinations)
param_grid = dict(
    max_depth=[2, 4, 6, 8],
    min_samples_split=[2, 4, 8],
    min_samples_leaf=[1, 2, 4],
)

# Decision tree regressor with default settings, used as the search's base model
dt_reg = DecisionTreeRegressor()

# Grid search with 5-fold cross validation, scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=dt_reg,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print(
    f"Best hyperparameters: {grid_search.best_params_} \n"
    f"Best negative mean squared error: {grid_search.best_score_}"
)

# RF tuning

# Candidate settings to try for the random forest (3 x 5 x 4 x 4 combinations)
param_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [2, 4, 6, 8, 10],
    "min_samples_split": [2, 4, 8, 16],
    "min_samples_leaf": [1, 2, 4, 8],
}

# Random Forest regressor used as the search's base model.
# A fixed random_state removes the forest's own run-to-run randomness, so the
# cross-validated scores of the candidates are comparable between runs.
rf_reg = RandomForestRegressor(random_state=42)

# Grid search with 5-fold cross validation, scored by negative mean squared error
grid_search = GridSearchCV(
    rf_reg, param_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print(
    f"Best hyperparameters: {grid_search.best_params_} \nBest negative mean squared error: {grid_search.best_score_}"
)

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# KNN tuning

# Candidate neighbourhood sizes
param_grid = dict(n_neighbors=[3, 5, 10, 15])

# KNN regressor with default settings, used as the search's base model.
# NOTE(review): the features are passed in unscaled; distances are presumably
# dominated by large-magnitude columns such as mileage -- confirm this is intended.
knn_reg = KNeighborsRegressor()

# Grid search with 5-fold cross validation, scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=knn_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print(
    f'Best hyperparameters: {grid_search.best_params_} \n'
    f'Best negative mean squared error: {grid_search.best_score_}'
)

"""## Fitting"""

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Linear regression: fit on the training split, score on the test split
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_preds)
print(f"Linear Regression MSE: {lr_mse}")

# Pair every feature with its fitted coefficient and keep the five largest.
# The ranking uses the signed coefficient, so strongly negative coefficients
# never make the list.
lr_coef_table = pd.DataFrame(
    {"feature": X_train.columns, "importance": lr.coef_}
)
lr_importances = lr_coef_table.nlargest(5, "importance")
print(f"Top 5 Linear Regression Features: \n{lr_importances}")

# Decision tree: fit with fixed hyperparameters on the training split,
# score on the test split
dt = DecisionTreeRegressor(
    max_depth=6,
    min_samples_leaf=2,
    min_samples_split=8,
)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
dt_mse = mean_squared_error(y_true=y_test, y_pred=dt_preds)
print(f"Decision Tree MSE: {dt_mse}")

# Five features with the highest importance reported by the fitted tree
dt_importance_table = pd.DataFrame(
    {"feature": X_train.columns, "importance": dt.feature_importances_}
)
dt_importances = dt_importance_table.nlargest(5, "importance")
print(f"Top 5 Decision Tree Features: \n{dt_importances}")

# Train random forest with fixed hyperparameters on the training split and
# score it on the test split.
# random_state pins the forest's internal randomness so the reported MSE and
# feature ranking do not change from one run to the next on the same data.
rf = RandomForestRegressor(
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=8,
    n_estimators=100,
    random_state=42,
)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_preds)
print(f"Random Forest MSE: {rf_mse}")

# Five features with the highest importance reported by the fitted forest
rf_importances = pd.DataFrame({"feature": X_train.columns, "importance": rf.feature_importances_}).nlargest(5, "importance")
print(f"Top 5 Random Forest Features: \n{rf_importances}")

# Train KNN
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# 5-nearest-neighbours regressor: fit on the training split, score on the
# test split
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
knn_mse = mean_squared_error(y_true=y_test, y_pred=knn_preds)
print(f"KNN MSE: {knn_mse}")

# No feature ranking is printed for KNN; only this fixed message is emitted
print("Top 5 KNN Features: Not applicable")