Skip to content

Commit

Permalink
upload code and ppt
Browse files Browse the repository at this point in the history
  • Loading branch information
elpham6 authored Dec 19, 2023
1 parent 5d46928 commit 9bbaaf3
Show file tree
Hide file tree
Showing 4 changed files with 661 additions and 77 deletions.
7 changes: 7 additions & 0 deletions Final Project README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Final Project README

Code files include Final_Project_EP_TS.py (main file) and tools.py (toolbox).

Download the data, "energydata_complete.csv" to your local machine. Modify the code, if needed, to read the data from there.

The code file to recreate the result is Final_Project_EP_TS.py. Run the file once. If need to review again, please clear the console or interactive console (if using VSCode) and run the entire file again.
Binary file added FinalProjectPresentation.pptx
Binary file not shown.
198 changes: 121 additions & 77 deletions final project draft 2.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,20 @@
from statsmodels.tsa.arima_process import ArmaProcess
from tools import ADF_Cal, Cal_rolling_mean_var, ACF_PACF_Plot, generate_arma,create_gpac_table, kpss_test
from scipy.stats import chi2
############## EDA #################

#%%
############## EDA #################
# upload this to github later for hardcoding
# url =
df = pd.read_csv("energydata_complete.csv")
print(df.head())

# %%
# count missing values
clean_df = df.dropna()
print(clean_df.info)
# data has no missing values

# %%
# Setting a time column as the index
clean_df['date'] = pd.to_datetime(clean_df['date'])
clean_df.set_index('date', inplace=True)

# separate X and Y
Y = clean_df['Appliances']
X = clean_df.drop('Appliances', axis=1)
Expand All @@ -40,26 +36,22 @@
plt.ylabel('Value')
plt.title('Energy vs Time')
plt.show()
# %%
# Plot ACF/PACF
ACF_PACF_Plot(Y, 50)

# %%
# Plotting Heatmap
corr = clean_df.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# %%
##### Split Data #####
from sklearn.model_selection import train_test_split
# Split 80/20 train/test
# Splitting the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, shuffle=False, random_state=6313)
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# %%
from statsmodels.tsa.stattools import adfuller, kpss
######## Test stationarity #############
Expand All @@ -71,7 +63,6 @@
# Rolling Mean and Variance
Cal_rolling_mean_var(clean_df, "Appliances")
# Data is stationary

# %%
####### STL Decomp ########
from statsmodels.tsa.seasonal import STL
Expand Down Expand Up @@ -100,30 +91,6 @@ def str_trend_seasonal(T, S, R):
str_trend_seasonal(T,S,R)
# no strong trend or seasonality

# %%
##### Holt-Winters method #####
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
# Fit the Holt-Winters model
hw_model = ExponentialSmoothing(Y_train,
trend='mul', seasonal = "mul",
seasonal_periods = 144,
damped=True).fit() # 144 10-min intervals a day
# Make predictions
hw_pred = hw_model.forecast(len(Y_test))

# Evaluate the model
hw_mse = mean_squared_error(Y_test, hw_pred)
print('Mean Squared Error:', hw_mse)

# Plotting the results
plt.figure()
Y_train.plot(label='Train')
Y_test.plot(label='Test')
hw_pred.plot(label='Holt-Winters Prediction')
plt.legend()
plt.show()

# %%
###### Feature Selection #######
# PCA
Expand All @@ -132,39 +99,77 @@ def str_trend_seasonal(T, S, R):

# Standardize the data
X_std = StandardScaler().fit_transform(X)

# Apply PCA
pca = PCA(n_components=2) # need to fix n_components
pca = PCA(n_components=22)
principalComponents = pca.fit_transform(X_std)

# To view explained variance
pca.explained_variance_ratio_
explained_variance = pca.explained_variance_ratio_
print("Explained Variance by Each Principal Component:", explained_variance)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title("PCA Explained Variance")
plt.show()
#%%
# SVD
U, s, VT = np.linalg.svd(X_std)
# Cond. number
cond_number = np.linalg.cond(X_std)
print("The condition number is: ", cond_number)
plt.figure
plt.plot(s, marker='o')
plt.title("Singular Values")
plt.xlabel("Index")
plt.ylabel("Value")
plt.yscale('log')
plt.show()

#%%
### VIF ###
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Assuming 'X' is your DataFrame of predictors
X_copy = X.copy()
X_copy = add_constant(X_copy) # Adding a constant column for the intercept

# Create a DataFrame that will store VIFs
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns

# Calculate VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)
#%%
# Backwards Stepwise
import statsmodels.api as sm

def backward_elimination(data, target, significance_level = 0.05):
def backward_elimination(data, target, significance_level=0.05):
features = data.columns.tolist()
while(len(features) > 0):
removed_features = [] # List to keep track of removed features

while len(features) > 0:
features_with_constant = sm.add_constant(data[features])
p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]

p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:] # Excluding the constant's p-value
max_p_value = p_values.max()

if max_p_value >= significance_level:
excluded_feature = p_values.idxmax()
features.remove(excluded_feature)
removed_features.append(excluded_feature) # Store the removed feature
print(f"Removed: {excluded_feature}") # Print the removed feature
else:
break

return features
return features, removed_features # Return both the selected and removed features

selected_features = backward_elimination(X, Y)
print(selected_features)
# Apply the function to your data
selected_features, eliminated_features = backward_elimination(X, Y)
print("Selected Features:", selected_features)
print("Eliminated Features:", eliminated_features)
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# %%
###### Baseline Model ######
Expand Down Expand Up @@ -198,17 +203,13 @@ def ses_forecast(train, alpha, h):
alpha = 0.5
# Average Model
avg = average_forecast(Y_train, h)

# Naïve Model
naive = naive_forecast(Y_train, h)

# Drift Model
drift = drift_forecast(Y_train, h)

# Simple Smoothing
simple_model = SimpleExpSmoothing(Y_train[:-h]).fit(smoothing_level=0.2)
simple_forecast = simple_model.forecast(h)

# Exponential Smoothing
exp_model = ExponentialSmoothing(Y_train[:-h], trend='add', seasonal='add', seasonal_periods=144).fit()
exp_forecast = exp_model.forecast(h)
Expand All @@ -227,20 +228,40 @@ def mean_squared_error(y_true, y_pred):
return mse

actual = Y_test

# Calculate MSE for each model
mse_avg = mean_squared_error(actual, avg)
mse_naive = mean_squared_error(actual, naive)
mse_drift = mean_squared_error(actual, drift)
mse_simple = mean_squared_error(actual, simple_forecast)
mse_exp = mean_squared_error(actual, exp_forecast)

# Print MSE values
print("MSE - Average:", mse_avg)
print("MSE - Naive:", mse_naive)
print("MSE - Drift:", mse_drift)
print("MSE - Simple Smoothing:", mse_simple)
print("MSE - Exponential Smoothing:", mse_exp)

# %%
##### Holt-Winters method #####
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
# Fit the Holt-Winters model
hw_model = ExponentialSmoothing(Y_train,
trend='add', seasonal = "add",
seasonal_periods = 144,
damped=True).fit() # 144 10-min intervals a day
# Make predictions
hw_pred = hw_model.forecast(len(Y_test))
# Evaluate the model
hw_mse = mean_squared_error(Y_test, hw_pred)
print('Mean Squared Error (HW Method):', hw_mse)
# Plotting the results
plt.figure()
Y_train.plot(label='Train')
Y_test.plot(label='Test')
hw_pred.plot(label='Holt-Winters Prediction')
plt.legend()
plt.show()
#%%
###### Regression ######
from sklearn.metrics import mean_squared_error, r2_score
Expand All @@ -250,24 +271,20 @@ def mean_squared_error(y_true, y_pred):
X_selected_test = X_test[selected_features]
# Cross-Validation
from sklearn.model_selection import TimeSeriesSplit

# Fit the model
model = sm.OLS(Y_train, sm.add_constant(X_selected_train)).fit()
# One-step ahead prediction
predictions = model.predict(sm.add_constant(X_selected_test))
# Model Evaluation
mse = mean_squared_error(Y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, predictions)
print(f"MSE: {mse}, RMSE: {rmse}, R-squared: {r2}")
# r2 = r2_score(Y_test, predictions)
print(f"MSE: {mse}, RMSE: {rmse}")
print(model.summary())


# ACF of Residuals and Q-Value
residuals = Y_test - predictions
ACF_PACF_Plot(residuals, 50)
plt.show()

# Calculate Q and Q-critical
acf_res = acf(residuals, nlags=30)
Q = len(Y_train)*np.sum(np.square(acf_res[30:]))
Expand All @@ -291,6 +308,7 @@ def mean_squared_error(y_true, y_pred):
plt.plot(Y_train, label='Train')
plt.plot(Y_test.index, Y_test, label='Test')
plt.plot(Y_test.index, predictions, label='Predictions')
plt.title("Regressions Predictions")
plt.legend()
plt.show()

Expand Down Expand Up @@ -381,6 +399,7 @@ def forecast_function(history, order, seasonal_order, forecast_periods):
print(sarima_forecast_values)
res_fc = Y_test - sarima_forecast_values
res_fc_var = np.var(res_fc)

#%%
##### Residual Analysis ######
## Whiteness Chi-square Test ##
Expand Down Expand Up @@ -421,29 +440,54 @@ def forecast_function(history, order, seasonal_order, forecast_periods):
#%%
# Residuals Variance
# Forecast Variance (get forcast errors, then calc variance)
print("Variance of the errors:", arma.resid.var())
print("Forecast Variance: ", res_fc_var)
#%%
## Model Simplification ##
root_check = np.roots([1, 0.7569])
print("Final coefficient(s): ", root_check[0])

#%%
##### h-Step Prediction ######
# skip this because the forecast function already handles h-step prediction
# model = sm.tsa.ARIMA(Y_train, order=(1, 0, 0))
# model_fit = model.fit()
# # h-step ahead forecast
# h = len(Y_test)
# forecast = model_fit.forecast(h) # Returns forecasted values
# plt.figure(figsize=(10, 6))
# plt.plot(Y_train, label='Train')
# plt.plot(Y_test, label='True Values')
# plt.plot(forecast, label='Predictions', color='red')
# plt.title('ARMA Model Predictions vs True Values')
# plt.xlabel('Time')
# plt.ylabel('Value')
# plt.legend()
# plt.show()
# root_check1 = np.roots([1,0.8102,-0.1820,0.0878,0.0174,
# 0.0267,0.0264,-0.0584,0.0988])
# print("Final coefficient(s): ", root_check1)

# %%
#### Final Model Selection ####
rmse_arma = np.sqrt(mean_squared_error(actual, sarima_forecast_values))
rmse_hw = np.sqrt(hw_mse)
from sklearn.metrics import r2_score

rmse_avg = np.sqrt(mse_avg)
rmse_naive = np.sqrt(mse_naive)
rmse_drift = np.sqrt(mse_drift)
rmse_simple = np.sqrt(mse_simple)
rmse_exp = np.sqrt(mse_exp)
model_metrics = {
'Naive': {'RMSE': rmse_naive},
'Average': {'RMSE': rmse_avg},
'Drift': {'RMSE': rmse_drift},
'Simple Smoothing': {'RMSE': rmse_simple},
'Exponential Smoothing': {'RMSE': rmse_exp},
'Holt-Winters': {'RMSE': rmse_hw},
'Regressions': {'RMSE': ols_rmse, 'R-squared': ols_r2},
'ARMA': {'RMSE': rmse_arma},
}
# Convert the dictionary to a DataFrame
df_model_metrics = pd.DataFrame(model_metrics).T
df_model_metrics = df_model_metrics.round(2)
print(df_model_metrics)
# %%
# Naive model as baseline
rmse_baseline = rmse_naive
def calculate_percentage_improvement(baseline_rmse, model_rmse):
return ((baseline_rmse - model_rmse) / baseline_rmse) * 100
# Adding percentage improvement to model_metrics
for model in model_metrics:
rmse_model = model_metrics[model]['RMSE']
model_metrics[model]['% Improvement'] = calculate_percentage_improvement(rmse_baseline, rmse_model)
# Convert the updated dictionary to a DataFrame
df_model_metrics = pd.DataFrame(model_metrics).T
# Round the values
df_model_metrics = df_model_metrics.round(2)
print(df_model_metrics)

# %%
Loading

0 comments on commit 9bbaaf3

Please sign in to comment.