upload code and ppt

elpham6 · Dec 19, 2023 · 9bbaaf3 · 9bbaaf3
1 parent 5d46928
commit 9bbaaf3
Show file tree

Hide file tree

Showing 4 changed files with 661 additions and 77 deletions.
diff --git a/Final Project README.txt b/Final Project README.txt
@@ -0,0 +1,7 @@
+Final Project README
+
+Code files include Final_Project_EP_TS.py (main file) and tools.py (toolbox).  
+
+Download the data, "energydata_complete.csv" to your local machine. Modify the code, if needed, to read the data from there. 
+
+The code file to recreate the result is Final_Project_EP_TS.py. Run the file once. If need to review again, please clear the console or interactive console (if using VSCode) and run the entire file again. 
diff --git a/FinalProjectPresentation.pptx b/FinalProjectPresentation.pptx
diff --git a/final project draft 2.py b/final project draft 2.py
@@ -10,24 +10,20 @@
 from statsmodels.tsa.arima_process import ArmaProcess
 from tools import ADF_Cal, Cal_rolling_mean_var, ACF_PACF_Plot, generate_arma,create_gpac_table, kpss_test
 from scipy.stats import chi2
-############## EDA #################
+
 #%%
+############## EDA #################
 # upload this to github later for hardcoding
 # url =
 df = pd.read_csv("energydata_complete.csv")
 print(df.head())
-
-# %%
 # count missing values
 clean_df = df.dropna()
 print(clean_df.info)
 # data has no missing values
-
-# %%
 # Setting a time column as the index
 clean_df['date'] = pd.to_datetime(clean_df['date'])
 clean_df.set_index('date', inplace=True)
-
 # separate X and Y
 Y = clean_df['Appliances']
 X = clean_df.drop('Appliances', axis=1)
@@ -40,26 +36,22 @@
 plt.ylabel('Value')
 plt.title('Energy vs Time')
 plt.show()
-# %%
 # Plot ACF/PACF
 ACF_PACF_Plot(Y, 50)
-
-# %%
 # Plotting Heatmap
 corr = clean_df.corr()
 plt.figure(figsize=(20, 20))
 sns.heatmap(corr, annot=True, cmap='coolwarm')
 plt.title("Correlation Matrix")
 plt.show()
-
 # %%
+##### Split Data #####
 from sklearn.model_selection import train_test_split
 # Split 80/20 train/test
 # Splitting the data into train and test sets
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, shuffle=False, random_state=6313)
 print(f"Training set size: {X_train.shape[0]}")
 print(f"Test set size: {X_test.shape[0]}")
-
 # %%
 from statsmodels.tsa.stattools import adfuller, kpss
 ######## Test stationarity #############
@@ -71,7 +63,6 @@
 # Rolling Mean and Variance
 Cal_rolling_mean_var(clean_df, "Appliances")
 # Data is stationary
-
 # %%
 ####### STL Decomp ########
 from statsmodels.tsa.seasonal import STL
@@ -100,30 +91,6 @@ def str_trend_seasonal(T, S, R):
 str_trend_seasonal(T,S,R)
 # no strong trend or seasonality
 
-# %%
-##### Holt-Winters method #####
-from statsmodels.tsa.holtwinters import ExponentialSmoothing
-from sklearn.metrics import mean_squared_error
-# Fit the Holt-Winters model
-hw_model = ExponentialSmoothing(Y_train, 
-                             trend='mul', seasonal = "mul",
-                             seasonal_periods = 144,
-                             damped=True).fit() # 144 10-min intervals a day
-# Make predictions
-hw_pred = hw_model.forecast(len(Y_test))
-
-# Evaluate the model
-hw_mse = mean_squared_error(Y_test, hw_pred)
-print('Mean Squared Error:', hw_mse)
-
-# Plotting the results
-plt.figure()
-Y_train.plot(label='Train')
-Y_test.plot(label='Test')
-hw_pred.plot(label='Holt-Winters Prediction')
-plt.legend()
-plt.show()
-
 # %%
 ###### Feature Selection #######
 # PCA
@@ -132,39 +99,77 @@ def str_trend_seasonal(T, S, R):
 
 # Standardize the data
 X_std = StandardScaler().fit_transform(X)  
-
 # Apply PCA
-pca = PCA(n_components=2)  # need to fix n_components
+pca = PCA(n_components=22)  
 principalComponents = pca.fit_transform(X_std)
-
 # To view explained variance
-pca.explained_variance_ratio_
+explained_variance = pca.explained_variance_ratio_
+print("Explained Variance by Each Principal Component:", explained_variance)
+plt.plot(np.cumsum(pca.explained_variance_ratio_))
+plt.xlabel('Number of Components')
+plt.ylabel('Cumulative Explained Variance')
+plt.title("PCA Explained Variance")
+plt.show()
+#%%
 # SVD
 U, s, VT = np.linalg.svd(X_std)
 # Cond. number
 cond_number = np.linalg.cond(X_std)
+print("The condition number is: ", cond_number)
+plt.figure
+plt.plot(s, marker='o')
+plt.title("Singular Values")
+plt.xlabel("Index")
+plt.ylabel("Value")
+plt.yscale('log')  
+plt.show()
+
+#%%
+### VIF ###
+from statsmodels.stats.outliers_influence import variance_inflation_factor
+from statsmodels.tools.tools import add_constant
 
+# Assuming 'X' is your DataFrame of predictors
+X_copy = X.copy()
+X_copy = add_constant(X_copy)  # Adding a constant column for the intercept
+
+# Create a DataFrame that will store VIFs
+vif_data = pd.DataFrame()
+vif_data["Feature"] = X.columns
+
+# Calculate VIF for each feature
+vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
+
+print(vif_data)
 #%% 
 # Backwards Stepwise
 import statsmodels.api as sm
 
-def backward_elimination(data, target, significance_level = 0.05):
+def backward_elimination(data, target, significance_level=0.05):
     features = data.columns.tolist()
-    while(len(features) > 0):
+    removed_features = []  # List to keep track of removed features
+
+    while len(features) > 0:
         features_with_constant = sm.add_constant(data[features])
-        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
-
+        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]  # Excluding the constant's p-value
         max_p_value = p_values.max()
+
         if max_p_value >= significance_level:
             excluded_feature = p_values.idxmax()
             features.remove(excluded_feature)
+            removed_features.append(excluded_feature)  # Store the removed feature
+            print(f"Removed: {excluded_feature}")  # Print the removed feature
         else:
             break
 
-    return features
+    return features, removed_features  # Return both the selected and removed features
 
-selected_features = backward_elimination(X, Y) 
-print(selected_features) 
+# Apply the function to your data
+selected_features, eliminated_features = backward_elimination(X, Y)
+print("Selected Features:", selected_features)
+print("Eliminated Features:", eliminated_features)
+X_train = X_train[selected_features]
+X_test = X_test[selected_features]
 
 # %%
 ###### Baseline Model ######
@@ -198,17 +203,13 @@ def ses_forecast(train, alpha, h):
 alpha = 0.5
 # Average Model
 avg = average_forecast(Y_train, h)
-
 # Naïve Model
 naive = naive_forecast(Y_train, h)
-
 # Drift Model
 drift = drift_forecast(Y_train, h)
-
 # Simple Smoothing
 simple_model = SimpleExpSmoothing(Y_train[:-h]).fit(smoothing_level=0.2)
 simple_forecast = simple_model.forecast(h)
-
 # Exponential Smoothing
 exp_model = ExponentialSmoothing(Y_train[:-h], trend='add', seasonal='add', seasonal_periods=144).fit()
 exp_forecast = exp_model.forecast(h)
@@ -227,20 +228,40 @@ def mean_squared_error(y_true, y_pred):
     return mse
 
 actual = Y_test
-
 # Calculate MSE for each model
 mse_avg = mean_squared_error(actual, avg)
 mse_naive = mean_squared_error(actual, naive)
 mse_drift = mean_squared_error(actual, drift)
 mse_simple = mean_squared_error(actual, simple_forecast)
 mse_exp = mean_squared_error(actual, exp_forecast)
-
 # Print MSE values
 print("MSE - Average:", mse_avg)
 print("MSE - Naive:", mse_naive)
 print("MSE - Drift:", mse_drift)
 print("MSE - Simple Smoothing:", mse_simple)
 print("MSE - Exponential Smoothing:", mse_exp)
+
+# %%
+##### Holt-Winters method #####
+from statsmodels.tsa.holtwinters import ExponentialSmoothing
+from sklearn.metrics import mean_squared_error
+# Fit the Holt-Winters model
+hw_model = ExponentialSmoothing(Y_train, 
+                             trend='add', seasonal = "add",
+                             seasonal_periods = 144,
+                             damped=True).fit() # 144 10-min intervals a day
+# Make predictions
+hw_pred = hw_model.forecast(len(Y_test))
+# Evaluate the model
+hw_mse = mean_squared_error(Y_test, hw_pred)
+print('Mean Squared Error (HW Method):', hw_mse)
+# Plotting the results
+plt.figure()
+Y_train.plot(label='Train')
+Y_test.plot(label='Test')
+hw_pred.plot(label='Holt-Winters Prediction')
+plt.legend()
+plt.show()
 #%%
 ###### Regression ######
 from sklearn.metrics import mean_squared_error, r2_score
@@ -250,24 +271,20 @@ def mean_squared_error(y_true, y_pred):
 X_selected_test = X_test[selected_features]
 # Cross-Validation
 from sklearn.model_selection import TimeSeriesSplit
-
 # Fit the model
 model = sm.OLS(Y_train, sm.add_constant(X_selected_train)).fit()
 # One-step ahead prediction
 predictions = model.predict(sm.add_constant(X_selected_test))
 # Model Evaluation
 mse = mean_squared_error(Y_test, predictions)
 rmse = np.sqrt(mse)
-r2 = r2_score(Y_test, predictions)
-print(f"MSE: {mse}, RMSE: {rmse}, R-squared: {r2}")
+# r2 = r2_score(Y_test, predictions)
+print(f"MSE: {mse}, RMSE: {rmse}")
 print(model.summary())
-
-
 # ACF of Residuals and Q-Value
 residuals = Y_test - predictions
 ACF_PACF_Plot(residuals, 50)
 plt.show()
-
 # Calculate Q and Q-critical
 acf_res = acf(residuals, nlags=30)
 Q = len(Y_train)*np.sum(np.square(acf_res[30:]))
@@ -291,6 +308,7 @@ def mean_squared_error(y_true, y_pred):
 plt.plot(Y_train, label='Train')
 plt.plot(Y_test.index, Y_test, label='Test')
 plt.plot(Y_test.index, predictions, label='Predictions')
+plt.title("Regressions Predictions")
 plt.legend()
 plt.show()
 
@@ -381,6 +399,7 @@ def forecast_function(history, order, seasonal_order, forecast_periods):
 print(sarima_forecast_values)
 res_fc = Y_test - sarima_forecast_values
 res_fc_var = np.var(res_fc)
+
 #%%
 ##### Residual Analysis ######
 ## Whiteness Chi-square Test ##
@@ -421,29 +440,54 @@ def forecast_function(history, order, seasonal_order, forecast_periods):
 #%%
 # Residuals Variance
 # Forecast Variance (get forcast errors, then calc variance)
+print("Variance of the errors:", arma.resid.var())
 print("Forecast Variance: ", res_fc_var)
 #%%
 ## Model Simplification ##
 root_check = np.roots([1, 0.7569])
 print("Final coefficient(s): ", root_check[0])
-
-#%%
-##### h-Step Prediction ######
-# skip this because the forecast function already handles h-step prediction
-# model = sm.tsa.ARIMA(Y_train, order=(1, 0, 0))  
-# model_fit = model.fit()
-# # h-step ahead forecast
-# h = len(Y_test)
-# forecast = model_fit.forecast(h)  # Returns forecasted values
-# plt.figure(figsize=(10, 6))
-# plt.plot(Y_train, label='Train')
-# plt.plot(Y_test, label='True Values')
-# plt.plot(forecast, label='Predictions', color='red')
-# plt.title('ARMA Model Predictions vs True Values')
-# plt.xlabel('Time')
-# plt.ylabel('Value')
-# plt.legend()
-# plt.show()
+# root_check1 = np.roots([1,0.8102,-0.1820,0.0878,0.0174,
+#                        0.0267,0.0264,-0.0584,0.0988])
+# print("Final coefficient(s): ", root_check1)
 
 # %%
 #### Final Model Selection ####
+rmse_arma = np.sqrt(mean_squared_error(actual, sarima_forecast_values))
+rmse_hw = np.sqrt(hw_mse)
+from sklearn.metrics import r2_score
+
+rmse_avg = np.sqrt(mse_avg)
+rmse_naive = np.sqrt(mse_naive)
+rmse_drift = np.sqrt(mse_drift)
+rmse_simple = np.sqrt(mse_simple)
+rmse_exp = np.sqrt(mse_exp)
+model_metrics = {
+    'Naive': {'RMSE': rmse_naive},
+    'Average': {'RMSE': rmse_avg},
+    'Drift': {'RMSE': rmse_drift},
+    'Simple Smoothing': {'RMSE': rmse_simple},
+    'Exponential Smoothing': {'RMSE': rmse_exp},
+    'Holt-Winters': {'RMSE': rmse_hw},
+    'Regressions': {'RMSE': ols_rmse, 'R-squared': ols_r2},
+    'ARMA': {'RMSE': rmse_arma},
+}
+# Convert the dictionary to a DataFrame
+df_model_metrics = pd.DataFrame(model_metrics).T
+df_model_metrics = df_model_metrics.round(2)
+print(df_model_metrics)
+# %%
+# Naive model as baseline
+rmse_baseline = rmse_naive
+def calculate_percentage_improvement(baseline_rmse, model_rmse):
+    return ((baseline_rmse - model_rmse) / baseline_rmse) * 100
+# Adding percentage improvement to model_metrics
+for model in model_metrics:
+    rmse_model = model_metrics[model]['RMSE']
+    model_metrics[model]['% Improvement'] = calculate_percentage_improvement(rmse_baseline, rmse_model)
+# Convert the updated dictionary to a DataFrame
+df_model_metrics = pd.DataFrame(model_metrics).T
+# Round the values
+df_model_metrics = df_model_metrics.round(2)
+print(df_model_metrics)
+
+# %%