---
title: "An Exploratory Data Analysis on Airline Customer Satisfaction"
author: "Parv Bhargava, Jehan Bugli, Venkata Madisetty, and Namratha Prakash"
date: "2023-12-16"
output: rmdformats::downcute
---
```{r init, include=FALSE}
# Load the 'psych' package for various statistics-related functions
library(psych)
# Load the 'readr' package for efficient reading of data files
library(readr)
# Load the 'forcats' package for handling categorical variables
library(forcats)
# Load the 'gridExtra' package for arranging multiple plots on a grid
library(gridExtra)
# Load the 'RColorBrewer' package for color palettes
library(RColorBrewer)
# Load the 'usdm' package for variance inflation factor (VIF) utilities
library(usdm)
# Load the 'ezids' package for EDA helper utilities such as xkabledplyhead()
library(ezids)
# Load the 'Hmisc' package for various functions, including 'describe'
library(Hmisc)
# Load the 'ggplot2' package for creating data visualizations using the Grammar of Graphics
library(ggplot2)
# Load the 'dplyr' package for data manipulation and transformation
library(dplyr)
# Load the 'car' package for functions related to regression diagnostics, including VIF
library(car)
# Load the 'corrplot' package for visualizing correlation matrices
library(corrplot)
# Load the 'kableExtra' package for advanced table formatting with 'kable'
library(kableExtra)
# Load the 'knitr' package for dynamic report generation
library(knitr)
# Load the 'lmtest' and 'sandwich' packages to help with linear model testing
library(lmtest)
library(sandwich)
# For constructing and plotting decision tree models.
library(rpart)
library(rpart.plot)
# Provides functions for training and plotting models, and performing cross-validation.
library(caret)
# For converting statistical analysis objects into tidy format.
library(broom)
# A collection of R packages for data manipulation and visualization.
library(tidyverse)
# Contains functions and datasets to support Venables and Ripley's MASS book.
library(MASS)
# For evaluating and visualizing classifier performance.
library(ROCR)
library(pROC)
```
```{r setup, include=FALSE}
# some of common options (and the defaults) are:
# include=T, eval=T, echo=T, results='hide'/'asis'/'markup',..., collapse=F, warning=T, message=T, error=T, cache=T, fig.width=6, fig.height=4, fig.dim=c(6,4) #inches, fig.align='left'/'center','right',
# knitr::opts_chunk$set(warning = F, results = "markup", message = F)
knitr::opts_chunk$set(warning = F, message = F)
# knitr::opts_chunk$set(include = F)
# knitr::opts_chunk$set(echo = TRUE)
options(scientific=T, digits = 3)
# options(scipen=9, digits = 3)
```
```{r scrollbar_vertical, include=FALSE}
options(width = 60)
local({
hook_output <- knitr::knit_hooks$get('output')
knitr::knit_hooks$set(output = function(x, options) {
if (!is.null(options$max.height)) options$attr.output <- c(
options$attr.output,
sprintf('style="max-height: %s;"', options$max.height)
)
hook_output(x, options)
})
})
```
# Introduction
Airline passenger satisfaction is a crucial metric for firms in the airline industry. Understanding the factors that contribute to customer satisfaction is essential for airlines to improve their services and compete effectively; high market saturation, as well as low profit margins, can magnify the effects of small advantages or disadvantages relative to other firms (Lutz et al., 2012; Hardee, 2023). In this research, we will analyze various factors that affect airline passenger satisfaction and, ultimately, judge their suitability for a regression model predicting passenger satisfaction. We will leverage a Kaggle dataset that includes surveyed passenger characteristics, flight details, and satisfaction ratings for select pre-flight and in-flight components (Klein, 2020). To ensure modeling suitability, we will conduct exploratory data analysis, taking into account variable distributions and types.
With our research, we aim to answer a few main questions. For one, **to what extent do certain surveyed passenger characteristics and flight experience components impact the likelihood that a passenger will be satisfied – rather than neutral or dissatisfied – with their trip?** This is the key focus of our research; we want to identify meaningful inputs for satisfaction and estimate the magnitude of their effects. Secondly, **how can we model the likelihood of passenger satisfaction using surveyed passenger characteristics and flight experience components in a manner that minimizes predictive bias?** While assembling our models, we need to ensure that issues such as multicollinearity and overfitting do not jeopardize our models' predictive validity. Finally, **to what extent can we predict the likelihood that a flight passenger will be satisfied with their experience using multiple different variable levels?** Our dataset utilizes continuous, ordinal, and categorical variables, all of which can require differing assumptions when used in modeling; incorporating these different variable levels into a model is an important step in predicting satisfaction.
The dataset for our research on airline passenger satisfaction contains various variables, which can be categorized into three types: continuous, categorical, and ordinal. Continuous variables include passenger age, flight distance, arrival delays, and departure delays. Categorical variables include gender, customer type (loyalty), the type of travel (business or personal), and the travel class (business, economy, or economy plus). Ordinal variables include a number of ratings from 0-5 concerning specific aspects of the flight experience. The "Satisfaction" variable represents the airline passenger's satisfaction level and includes two categories: "satisfied" or "neutral or dissatisfied." This will be our primary outcome variable for analysis.
### Variable limitations
While the analysis and insight generation opportunities are manifold, certain fields in this dataset present challenges that limit a resulting model's predictive validity. One critical issue is data collection; while some variable-related documentation is available, **we are not able to discern the circumstances under which this survey was distributed using the Kaggle source** (Klein, 2020). The population may have been sampled through methods, such as convenience sampling, that make the resulting data less representative of the overall population despite the large observation count. The overall population in question is also not clear; the survey may have focused on a particular airport or region, limiting potential predictive validity in alternative settings.
Another issue is that **the document does not elaborate upon what counts as a "loyal" or "disloyal" customer for the customer type field**. This makes it difficult to properly interpret the effects of such a variable in a regression model. The threshold for disloyalty could potentially range from using any other airlines at all to using other airlines a majority of the time, drastically altering any potential real-world applications.
A third (but not final) problematic factor is that **ticket prices are not included in this survey**, with class serving as a rough proxy; intuitively, prices could be a major factor in passengers' service expectations and their subsequent ratings. The lack of price ranges associated with seat class also makes it difficult to encode the three categories in a way that accurately captures the disparity.
```{r data,echo=FALSE}
data <- read.csv("data\\train.csv")
data_test <- read.csv("data\\test.csv")
xkabledplyhead(data, pos = "left", bso = "striped")
```
#### Data structure
```{r structure of the data,echo=FALSE}
str(data)
```
#### Data dimensions
```{r dimension of the data,include=FALSE}
dims <- dim(data)
dims
```
This is a data frame with **`r dims[1]`** observations (rows) and **`r dims[2]`** variables (columns). Assuming that a robust sampling method was utilized, the large number of observations may allow us to conclude that the data is generally representative of the actual population.
#### An initial description of the data
```{r description,max.height = "500px",echo=FALSE }
describe(data)
```
## Data pre-processing
#### **Duplicate values**
```{r data_preprocessing_obs, echo=FALSE}
# Check for missing values
missing_data <- data %>%
summarise_all(~ sum(is.na(.)))
# Check for duplicated rows
duplicate_rows <- data %>%
summarise(n_duplicates = sum(duplicated(.)))
```
We first imported the data into R using the `read.csv()` function; the first few rows of the dataset are included above. The chunk above also checks for missing and duplicated observations (the data contains `r duplicate_rows$n_duplicates` duplicated rows). The data required some cleaning before use in testing. One issue was that the arrival delays field included a number of NA values; we elected to replace these with the median delay. The median was chosen over other potential replacement options, such as the mean, due to the skewed distribution of delays detailed later on. Apart from that, ratings responses equaling 0 indicate that the question was not applicable; respondents who selected this option for any of the ratings variables are filtered out to ensure that all of the individual ratings are relevant for all observations. While alternatives exist, such as replacement, the large number of initial observations limited our concerns over a potential loss in predictive validity. All steps were repeated for both the training and testing datasets.
```{r data preprocessing, echo=FALSE}
# Get unnecessary columns
drop <- c("X","id")
# Drop column names specified in vector
data <- data[,!(names(data) %in% drop)]
#Select ratings columns
selected_columns <- 7:20
# Check if any ratings include zeros (representing N/A)
has_zeros <- apply(data[selected_columns], 1, function(row) any(row == 0))
# Remove rows with zeros in the selected columns
data <- data[!has_zeros, ]
# Replace NA arrival delays with the median delay
data$Arrival.Delay.in.Minutes[is.na(data$Arrival.Delay.in.Minutes)] <- median(data$Arrival.Delay.in.Minutes, na.rm = TRUE)
missing_data <- data %>%
summarise_all(~ sum(is.na(.)))
```
```{r data preprocessing_test, echo=FALSE}
# Get unnecessary columns
drop <- c("X","id")
# Drop column names specified in vector
data_test <- data_test[,!(names(data_test) %in% drop)]
#Select ratings columns
selected_columns <- 7:20
# Check if any ratings include zeros (representing N/A)
has_zeros <- apply(data_test[selected_columns], 1, function(row) any(row == 0))
# Remove rows with zeros in the selected columns
data_test <- data_test[!has_zeros, ]
# Replace NA arrival delays with the median delay
data_test$Arrival.Delay.in.Minutes[is.na(data_test$Arrival.Delay.in.Minutes)] <- median(data_test$Arrival.Delay.in.Minutes, na.rm = TRUE)
missing_data <- data_test %>%
summarise_all(~ sum(is.na(.)))
```
```{r summary_stats, include=FALSE}
# Summary statistics for numeric fields
numeric_fields <- c("Age", "Flight.Distance", "Departure.Delay.in.Minutes", "Arrival.Delay.in.Minutes")
summary_stats_numeric <- summary(data[numeric_fields])
# Summary statistics for categorical fields
categorical_fields <- c("Gender", "Customer.Type", "Type.of.Travel", "Class",
"Inflight.wifi.service", "Departure.Arrival.time.convenient", "Ease.of.Online.booking",
"Gate.location", "Food.and.drink", "Online.boarding", "Seat.comfort",
"Inflight.entertainment", "On.board.service", "Leg.room.service", "Baggage.handling",
"Checkin.service", "Inflight.service", "Cleanliness", "satisfaction")
summary_stats_categorical <- data %>%
summarise(across(all_of(categorical_fields),
list(n = ~ length(.), n_distinct = ~ length(unique(.)), top_freq = ~ names(sort(table(.), decreasing = TRUE)[1]))
))
```
# Examining variable distributions
Following data pre-processing, we plotted variable distributions to attempt to identify potential trends and correlations. Given a robust sampling method, we can safely assume that these distributions (including the highly skewed ones) are representative of the overall population. Initially, none of the categorical fields appear to be highly correlated, but we intend to confirm this using variance inflation factor (VIF) analysis following initial model creation ("vif: Variance Inflation Factors", n.d.). Looking at the distribution of class, Eco Plus has a significantly lower observation frequency than the other two. In addition, as noted earlier, the magnitudes of increments between Eco, Eco Plus, and Business are not clear; we noted that some transformation may be required later to ensure modeling suitability.
When plotting continuous variable distributions, flight distance as well as both delay variables have a strong right skew. This makes sense intuitively; we would expect most flights to have minimal to no delays, and shorter flights are likely more frequent. Age appears to be bimodal to a degree, with a small peak around 20-25 and another peak roughly around 35-50. Depending on the type of regression that is ultimately selected, some of these variables may require aggressive transformations to better approximate normal distributions. Many of the distributions for individual ratings variables look quite similar, raising multicollinearity concerns that will be addressed later.
#### Frequency distributions for categorical variables
```{r histograms_categorical, fig.width=15, fig.height=15, echo=FALSE}
categorical_vars_ggplot <- c('Gender', 'Customer.Type', 'Type.of.Travel', 'Class', 'satisfaction')
plot_list <- list()
for (cat_var in categorical_vars_ggplot) {
plot_obj <- ggplot(data, aes_string(x = cat_var, fill = cat_var)) +
geom_bar() +
geom_text(stat='count', aes_string(label='..count..', y='..count..'), vjust=1.2, size = 6) +
labs(title = paste("\n Distribution of", cat_var,"\n"), x = cat_var, y = "Count") +
scale_fill_brewer(palette="Set3") +
theme_minimal() + # Apply a minimal theme
theme(
axis.text = element_text(size = 20), # Adjust the size of text elements
axis.title = element_text(size = 20, face = "bold"),
legend.title = element_text(size = 20, face = "bold"),
legend.text = element_text(size = 20),
legend.key.size = unit(2, "lines"),
plot.title = element_text(size = 20)
) +
theme(legend.position="none") +
theme(plot.margin = margin(0, 0, 20, 0))
plot_list[[cat_var]] <- plot_obj
}
grid.arrange(grobs = plot_list, ncol = 2)
```
```{r variables, include=FALSE}
numerical_vars_ggplot <- c('Age', 'Flight.Distance', 'Inflight.wifi.service', 'Departure.Arrival.time.convenient', 'Ease.of.Online.booking', 'Gate.location', 'Food.and.drink', 'Online.boarding', 'Seat.comfort', 'Inflight.entertainment', 'On.board.service', 'Leg.room.service', 'Baggage.handling', 'Checkin.service', 'Inflight.service', 'Cleanliness', 'Departure.Delay.in.Minutes', 'Arrival.Delay.in.Minutes')
continuous_vars <- c('Age', 'Flight.Distance', 'Departure.Delay.in.Minutes', 'Arrival.Delay.in.Minutes')
```
#### Frequency distributions for continuous variables
```{r continous_variables, fig.width=10, fig.height=10, echo=FALSE}
continuous_plots <- list()
for (num_var in continuous_vars) {
plot_obj <- ggplot(data, aes_string(x = num_var)) +
geom_histogram(bins = 40, fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = paste("Distribution of", num_var), x = num_var, y = "Frequency") +
theme_minimal()
continuous_plots <- append(continuous_plots, list(plot_obj))
}
if (length(continuous_plots) > 0) {
grid.arrange(grobs = continuous_plots, ncol = 2)
}
```
#### Frequency distributions for ordinal variables (Ratings)
```{r rating_plots, fig.width=50, fig.height=100, echo=FALSE}
rating_vars <- c('Inflight.wifi.service', 'Departure.Arrival.time.convenient', 'Ease.of.Online.booking', 'Gate.location', 'Food.and.drink', 'Online.boarding', 'Seat.comfort', 'Inflight.entertainment', 'On.board.service', 'Leg.room.service', 'Baggage.handling', 'Checkin.service', 'Inflight.service', 'Cleanliness')
rating_plots <- list()
for (rate_var in rating_vars) {
plot_obj <- ggplot(data, aes_string(x = rate_var)) +
geom_bar(fill = "lightblue" , position = "dodge") +
geom_text(stat='count', aes_string(label='..count..', y='..count..'), vjust=1.2, size = 20) +
labs(title = paste("\nDistribution of", rate_var,"\n"), x = rate_var, y = "Frequency")+
scale_fill_brewer(palette="Set3") +
theme_minimal() +
theme(legend.position="none",
plot.title = element_text(size = 54, face = "bold"),
axis.title.x = element_text(size = 52, face = "bold"),
axis.title.y = element_text(size = 52, face = "bold"),
axis.text.x = element_text(size = 50),
axis.text.y = element_text(size = 50))
rating_plots <- append(rating_plots, list(plot_obj))
}
if (length(rating_plots) > 0) {
grid.arrange(grobs = rating_plots, ncol = 2)
}
```
# Distributions with respect to satisfaction
We also used plots to visually discern differences in continuous variables between satisfied and unsatisfied groups, potentially revealing significant model inputs. The first step was to use box-plots for continuous variables. We found that older passengers tend to be more satisfied with their flights compared to their younger counterparts. Also, on average, passengers who embark on longer journeys tend to report higher levels of satisfaction. The basis for this trend is unclear at this time, but further investigation may yield actionable conclusions in this regard. Flights experiencing greater departure delays appear to have a slightly higher proportion of neutral or dissatisfied customers, which supports the intuition that prolonged delays before takeoff may negatively affect passenger contentment. Similarly to departure delays, flights with higher arrival delays tend to exhibit a marginally increased prevalence of neutral or dissatisfied customers. This underscores the potential impact of delays—both at departure and arrival—on passenger satisfaction, although more investigation was required to uncover the exact nature of this relationship. A scatterplot uncovered potential multicollinearity concerns to be addressed later.
Histograms for categorical variables uncovered a distinct trend in terms of customer loyalty. Loyal customers, those who have a history of repeat business with the airline, tend to report higher levels of satisfaction compared to disloyal or infrequent flyers. There is a significant satisfaction discrepancy between individuals traveling for business and personal reasons; a majority of business travelers were satisfied, while an overwhelming proportion of personal travelers expressed dissatisfaction or neutrality. The nature of this relationship, as well as actionable insights that may be drawn from it, are unclear at this point. Business class passengers stand out as notably more satisfied than those in Economy or Economy Plus. If proven to be statistically significant, this factor could spur class-specific service and amenity adjustments for efficient satisfaction gains. It might also warrant future study detailing meaningful distinctions in the flight experience between classes. The notable exception here is gender, across which there were no notable differences in satisfaction.
#### Continuous variable boxplots
```{r box_plot, fig.width=10, fig.height=10, echo=FALSE}
histogram_vars <- c('Age', 'Flight.Distance', 'Departure.Delay.in.Minutes', 'Arrival.Delay.in.Minutes')
boxplot_list <- list()
for (num_var in histogram_vars) {
plot_obj <- ggplot(data, aes_string(x = "satisfaction", y = num_var)) +
geom_boxplot(fill = "turquoise", color = "black", alpha = 0.7) +
labs(title = paste("Box plot", num_var),
x = "Satisfaction", y = num_var) +
theme_minimal() +
theme(axis.text = element_text(size=10),
axis.title = element_text(size=10, face="bold"),
plot.title = element_text(hjust = 0.3))
boxplot_list <- append(boxplot_list, list(plot_obj))
}
grid.arrange(grobs = boxplot_list, ncol = 2)
```
#### Categorical variable histograms
```{r target_variable_histograms, echo=FALSE}
categorical_features <- c('Gender', 'Customer.Type', 'Type.of.Travel', 'Class')
# (ggplot objects are printed one at a time; base-graphics par() settings do not apply)
for (feature in categorical_features) {
p <- ggplot(data, aes_string(x=feature, fill='satisfaction')) +
geom_bar(position="dodge") +
geom_text(stat='count', aes(label=..count..), vjust= 1.2, position=position_dodge(width=0.9)) +
labs(title = paste("\nDistribution of", gsub("`", "", feature), "by Satisfaction"), x = gsub("`", "", feature), y = "Count") +
scale_fill_brewer(palette="Set3") +
theme_minimal() +
theme(legend.position="top")
print(p)
}
```
#### Continuous variable KDE (Kernel Density Estimation) plots
```{r numeric_dist, echo=FALSE}
numerical_features <- c('Age', 'Flight.Distance', 'Departure.Delay.in.Minutes','Arrival.Delay.in.Minutes')
# (ggplot objects are printed one at a time; base-graphics par() settings do not apply)
for (feature in numerical_features) {
p <- ggplot(data, aes_string(x=feature, fill='satisfaction')) +
geom_density(alpha=0.5, position="identity") +
labs(title = paste("Distribution of", feature, "by Satisfaction"), x = feature, y = "Density") +
scale_fill_manual(values=c("satisfied"="green", "neutral or dissatisfied"="red")) +
theme_minimal() +
theme(legend.position="top")
print(p)
}
```
#### Arrival and departure delay scatterplot
```{r scatter_plots, echo=FALSE}
# Sample 5,000 rows so the scatter plot remains readable
data_sample <- data[sample(nrow(data), 5000), ]
# Scatter plot
ggplot(data_sample, aes(x=Departure.Delay.in.Minutes, y=Arrival.Delay.in.Minutes, color=satisfaction)) +
geom_point(alpha=0.7) +
scale_color_manual(values=c("neutral or dissatisfied"="red", "satisfied"="green")) +
labs(title="\nRelationship between Departure and Arrival Delays by Satisfaction\n") +
theme_minimal()
```
# Correlation matrices
Our final EDA step was to examine multicollinearity; to accomplish this, we built two correlation matrices for continuous and ordinal (ratings) variables respectively. As observed earlier, arrival and departure delays appear to be highly correlated; certain steps, such as removing one of the two or calculating an average delay variable, would likely be necessary for use in a predictive model. We also found that certain ratings variables have strong positive correlations with each other. If these are included in the model without adjustments, our model may suffer a loss in reliability. To avoid this issue, we elected to combine the ratings variables into two groups, based on the degree of correlation, and utilize average ratings from these two groups as model inputs.
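As a brief illustration of the averaging option mentioned above (a sketch only; the `avg_delay` name is ours, and this variable is not used in any of the models below), the two highly correlated delay fields could be collapsed into one:
```{r avg_delay_sketch}
# Illustrative only: collapse the two highly correlated delay fields into one
avg_delay <- rowMeans(data[c("Departure.Delay.in.Minutes", "Arrival.Delay.in.Minutes")],
                      na.rm = TRUE)
summary(avg_delay)
```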
#### Continuous variable correlations
```{r Numerical_Corr_Analysis, echo=FALSE}
data_cor <- cor(subset(data,select = c(Age, Flight.Distance,Departure.Delay.in.Minutes,Arrival.Delay.in.Minutes)))
summary(data_cor)
options(repr.plot.width = 14, repr.plot.height = 8)
corrplot(data_cor, na.label = " ", method="circle", type = "upper",tl.col = "black", tl.cex = 1)
```
#### Ratings variable correlations
```{r ratings_corr, fig.width=10, fig.height=10, echo=FALSE}
ratings_fields <- c("Inflight.wifi.service", "Departure.Arrival.time.convenient", "Ease.of.Online.booking",
                    "Gate.location", "Food.and.drink", "Online.boarding", "Seat.comfort",
                    "Inflight.entertainment", "On.board.service", "Leg.room.service", "Baggage.handling",
                    "Checkin.service", "Inflight.service", "Cleanliness")
correlation_matrix <- cor(data[ratings_fields])
options(repr.plot.width=100, repr.plot.height=80)
corrplot(correlation_matrix, method = "circle", type = "upper", order = "hclust",
tl.col = "black", tl.srt = 90, addCoef.col = "black", number.cex = 0.9,
cl.cex = 0.9)
```
#### Aggregated ratings variable inclusions and summary statistics
| **Ratings Group 1: Pre-Flight & Wi-Fi** | **Ratings Group 2: In-Flight & Baggage** |
|:---------------------------------------|:---------------------------------------|
| In-Flight Wifi Service | Food and Drink |
| Departure / Arrival Time Convenience | Seat Comfort |
| Ease of Online Booking | In-Flight Entertainment |
| Gate Location | Onboard Service |
| Online Boarding | Leg Room Service |
| | Baggage Handling |
| | Check-In Service |
| | In-Flight Service |
| | Cleanliness |
```{r ratings_combine, echo=FALSE}
# Select columns for Group1 (base-R indexing is used because MASS::select masks dplyr::select)
ratings_group1 <- data[, c("Inflight.wifi.service", "Departure.Arrival.time.convenient",
                           "Ease.of.Online.booking", "Gate.location", "Online.boarding")]
# Calculate the average for Group1
data$Pre_Flight_and_WiFi_Ratings <- rowMeans(ratings_group1, na.rm = TRUE)
# Select columns for Group2
ratings_group2 <- data[, c("Food.and.drink", "Seat.comfort", "Inflight.entertainment",
                           "On.board.service", "Leg.room.service", "Baggage.handling",
                           "Checkin.service", "Inflight.service", "Cleanliness")]
# Calculate the average for Group2
data$In_Flight_and_Baggage_Ratings <- rowMeans(ratings_group2, na.rm = TRUE)
data_ratings_combined <- data[c("Pre_Flight_and_WiFi_Ratings","In_Flight_and_Baggage_Ratings")]
summary(data_ratings_combined)
```
# Probability and standard OLS estimates
Before engaging in further analysis, we first identified that satisfaction, as a categorical/binary variable, runs into a fundamental interpretation issue under a standard linear model: **the standard linear model's predictions are not bounded between 0 and 1 in the same manner as our satisfaction variable**. Under certain inputs, the linear model predicts unattainable values outside the satisfied and neutral/dissatisfied encodings (1 and 0 respectively), and key assumptions of linearity and homoskedasticity are violated.
Despite this restriction, linear probability models remain in widespread use, particularly among social scientists, making this a potentially fruitful avenue for a predictive model (Allison, 2015). This largely stems from ease of interpretation and generation; unlike logit (to be discussed later), a linear probability model directly predicts changes in probability rather than odds ratios, is easier to run, and approximates logit within the 0.2-0.8 probability range in most cases (Allison, 2020). We generated a linear model and used coefficient t-tests with heteroskedasticity-robust standard errors to account for the violated homoskedasticity assumption.
```{r encoding, echo=FALSE}
# Encode satisfaction and the binary categorical variables as 1/0 for modeling
data$satisfaction <- ifelse(data$satisfaction == "satisfied", 1, 0)
data$Gender <- ifelse(data$Gender == "Male", 1, 0)
data$Customer.Type <- ifelse(data$Customer.Type == "Loyal Customer", 1, 0)
data$Type.of.Travel <- ifelse(data$Type.of.Travel == "Business travel", 1, 0)
data$Class <- ifelse(data$Class %in% c("Eco", "Eco Plus"), 0,
ifelse(data$Class == "Business", 1, NA))
```
```{r linear_model_creation, fig.width = 15, fig.height = 15, fig.align = 'center', echo=FALSE}
linear_model <- lm(satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel + Class + Flight.Distance + Pre_Flight_and_WiFi_Ratings + In_Flight_and_Baggage_Ratings + Arrival.Delay.in.Minutes, data = data)
summary(linear_model)
coeftest(linear_model, vcov = vcovHC(linear_model, type="HC1"))
```
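As a quick, illustrative check of the boundedness issue described above (a sketch; the `fitted_vals` name is ours), we can measure how often the linear model's fitted values fall outside the attainable [0, 1] range:
```{r lpm_bounds_check}
# Range of LPM fitted values, and the share outside the attainable [0, 1] interval
fitted_vals <- fitted(linear_model)
range(fitted_vals)
mean(fitted_vals < 0 | fitted_vals > 1)
```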
Based on our linear model, all inputs apart from gender and age have statistically significant impacts on satisfaction likelihood. As mentioned earlier, one major advantage of the linear model is that coefficients can be interpreted directly. For instance, loyal customers display a 0.357 increase (35.7 percentage points) in predicted satisfaction probability relative to others. In a similar vein, the model predicts a 43.5 percentage point higher satisfaction probability for passengers traveling for business relative to others. For the non-binary aggregated ratings, a 1-point increase corresponds to predicted satisfaction probability increases of 9.07 and 22.9 percentage points for the pre-flight and in-flight groups respectively.
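To make these readings concrete, here is a minimal sketch (assuming the `linear_model` object above): individual coefficients can be read directly as probability changes, and `lmtest::coefci()` yields confidence intervals consistent with the HC1 robust errors used earlier:
```{r lpm_interpretation}
# A single coefficient, read as a change in predicted satisfaction probability
coef(linear_model)["Customer.Type"]
# 95% confidence intervals using the same HC1 robust covariance as coeftest()
coefci(linear_model, vcov. = vcovHC(linear_model, type = "HC1"))
```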
However, to confirm that the linear model is indeed a practically valuable predictor, we can't rely solely on the dataset used for training; our source provides a second testing dataset for which we can repeat cleaning/encoding steps and apply our model. Since gender and age are not significant, we elected to remove them prior to this step (marking this as a "v2" model). Using a confusion matrix, we determined that the v2 model's "accuracy"—the proportion of correctly predicted satisfaction values out of all respondents—is over 80% for the testing dataset. Based on this information, we can conclude that the linear model is a reasonably good predictor that isn't overfitting the training data.
```{r linear_model_v2, fig.width = 15, fig.height = 15, fig.align = 'center', echo=FALSE}
linear_model_v2 <- lm(satisfaction ~ Customer.Type + Type.of.Travel + Class + Flight.Distance + Pre_Flight_and_WiFi_Ratings + In_Flight_and_Baggage_Ratings + Arrival.Delay.in.Minutes, data = data)
summary(linear_model_v2)
```
```{r data_test_cleaning,echo=FALSE}
# Check for missing values
missing_data <- data_test %>%
summarise_all(~ sum(is.na(.)))
# Check for duplicated rows
duplicate_rows <- data_test %>%
summarise(n_duplicates = sum(duplicated(.)))
# Get unnecessary columns
drop <- c("X","id")
# Drop column names specified in vector
data_test <- data_test[,!(names(data_test) %in% drop)]
# Select ratings columns
selected_columns <- 7:20
# Check if any ratings include zeros (representing N/A)
has_zeros <- apply(data_test[selected_columns], 1, function(row) any(row == 0))
# Remove rows with zeros in the selected columns
data_test <- data_test[!has_zeros, ]
# Replace NA arrival delays with the median delay
data_test$Arrival.Delay.in.Minutes[is.na(data_test$Arrival.Delay.in.Minutes)] <- median(data_test$Arrival.Delay.in.Minutes, na.rm = TRUE)
missing_data <- data_test %>%
summarise_all(~ sum(is.na(.)))
# Repeat encoding steps
data_test$satisfaction <- ifelse(data_test$satisfaction == "satisfied", 1, 0)
data_test$Gender <- ifelse(data_test$Gender == "Male", 1, 0)
data_test$Customer.Type <- ifelse(data_test$Customer.Type == "Loyal Customer", 1, 0)
data_test$Type.of.Travel <- ifelse(data_test$Type.of.Travel == "Business travel", 1, 0)
data_test$Class <- ifelse(data_test$Class %in% c("Eco", "Eco Plus"), 0,
ifelse(data_test$Class == "Business", 1, NA))
# Repeat ratings aggregation steps
# Select columns for Group1 (base-R indexing, as above)
ratings_group1_test <- data_test[, c("Inflight.wifi.service", "Departure.Arrival.time.convenient",
                                     "Ease.of.Online.booking", "Gate.location", "Online.boarding")]
# Calculate the average for Group1
data_test$Pre_Flight_and_WiFi_Ratings <- rowMeans(ratings_group1_test, na.rm = TRUE)
# Select columns for Group2
ratings_group2_test <- data_test[, c("Food.and.drink", "Seat.comfort", "Inflight.entertainment",
                                     "On.board.service", "Leg.room.service", "Baggage.handling",
                                     "Checkin.service", "Inflight.service", "Cleanliness")]
# Calculate the average for Group2
data_test$In_Flight_and_Baggage_Ratings <- rowMeans(ratings_group2_test, na.rm = TRUE)
data_ratings_combined_test <- data_test[c("Pre_Flight_and_WiFi_Ratings","In_Flight_and_Baggage_Ratings")]
```
```{r linear_model_v2_test,echo=FALSE}
data_test$predicted_probabilities_linear <- predict(linear_model_v2, newdata = data_test)
data_test$predicted_outcome_linear <- ifelse(data_test$predicted_probabilities_linear > 0.5, 1, 0)
confusion_matrix <- table(data_test$satisfaction, data_test$predicted_outcome_linear)
print(confusion_matrix)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
# print(paste("Accuracy:", round(accuracy, 3)))
```
- **Accuracy**: `r accuracy`
However, it is not yet clear that a linear model would be the best predictor available. **Logistic regression**, which predicts the log odds of satisfaction, is the dominant approach for modeling binary variables (Allison, 2015). Logistic regression models utilize different assumptions relative to linear models, significantly altering the necessary EDA steps: rather than assuming a linear relationship between parameters and the dependent variable, **logistic regression assumes that parameters have a linear relationship with the log odds** ("Assumptions of Logistic Regression", n.d.). Independence of errors and the absence of multicollinearity remain assumptions for both linear and logistic models, while homoskedasticity and normally distributed residuals are not required under logistic regression ("Assumptions of Logistic Regression", n.d.).
Odds represent the number of favorable outcomes divided by the number of unfavorable outcomes. Put differently, if "p" represents the probability of a favorable outcome, Odds = **p/(1-p)**. Log odds take the natural log of the odds, which can be expressed as **ln(p/(1-p))** (Agarwal, 2019). We used visual tests to examine whether or not this assumption holds true for continuous variables. While it is not sensible to compute log odds for individual data points, we grouped continuous variables into discrete buckets, calculating the average log odds for each, to examine whether or not they might satisfy this assumption.
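For a concrete sense of the transformation, here is a small illustrative calculation (the 80% figure is hypothetical, not taken from the dataset):
```{r log_odds_worked_example}
# Hypothetical bucket in which 80% of passengers are satisfied
p <- 0.8
p / (1 - p)        # odds: 4 satisfied passengers for every dissatisfied one
log(p / (1 - p))   # log odds: ln(4), roughly 1.39
```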
Only flight distance and the in-flight and baggage ratings displayed roughly linear relationships with the log odds of satisfaction in our testing. Age appeared to have a parabolic relationship, peaking in the middle, indicating that an aggressive transformation may be necessary to reach a linear relationship. Meanwhile, log odds for both delay statistics quickly dispersed in both directions as delays increase (likely in part due to the limited frequency of longer delays), making it difficult to conclude with certainty that a linear relationship exists. Pre-flight and wi-fi ratings appear to have a significantly looser connection than the in-flight ratings, with a potential dip in log odds for mid-range ratings.
# Building and testing a logit model
## Testing linearity with log odds
```{r discrete, fig.width = 15, fig.height = 15, fig.align = 'center', echo=FALSE}
# Calculate the discrete buckets for each variable
age_breaks <- seq(0, 90, by = 2)
dist_breaks <- seq(0, 5000, by = 20)
delay_breaks <- seq(0, 1750, by = 10)
rating_breaks <- seq(0,5,by=0.1)
# Use cut() to create breaks; include.lowest = TRUE keeps boundary values
# (e.g., the many flights with exactly 0 minutes of delay) in the first bucket
data$AgeCategory <- cut(data$Age, breaks = age_breaks)
data$DistCategory <- cut(data$Flight.Distance, breaks = dist_breaks)
data$DepDelayCategory <- cut(data$Departure.Delay.in.Minutes, breaks = delay_breaks, include.lowest = TRUE)
data$ArrDelayCategory <- cut(data$Arrival.Delay.in.Minutes, breaks = delay_breaks, include.lowest = TRUE)
data$PreFlightCategory <- cut(data$Pre_Flight_and_WiFi_Ratings, breaks = rating_breaks)
data$InFlightCategory <- cut(data$In_Flight_and_Baggage_Ratings, breaks = rating_breaks)
```
```{r log_odds, echo=FALSE}
# Define a function to calculate the log odds for a given x value, to use with aggregate()
log_odds_calc <- function(x) {
avg <- mean(x)
log_odds <- log(avg / (1 - avg))
return(log_odds)
}
# Use aggregate with the log odds function for each continuous variable of interest (with their discrete grouping)
log_odds_Age <- aggregate(satisfaction ~ AgeCategory, data, log_odds_calc)
log_odds_Dist <- aggregate(satisfaction ~ DistCategory, data, log_odds_calc)
log_odds_DepDelay <- aggregate(satisfaction ~ DepDelayCategory, data, log_odds_calc)
log_odds_ArrDelay <- aggregate(satisfaction ~ ArrDelayCategory, data, log_odds_calc)
log_odds_PreFlight <- aggregate(satisfaction ~ PreFlightCategory, data, log_odds_calc)
log_odds_InFlight <- aggregate(satisfaction ~ InFlightCategory, data, log_odds_calc)
```
```{r log_odds_cont_visual, fig.width = 15, fig.height = 15, fig.align = 'center', echo=FALSE}
# Helper to plot average log odds against a bucketed predictor; this avoids
# repeating the same ggplot boilerplate for every variable
log_odds_plot <- function(df, xvar, point_color, xlab, title) {
  ggplot(df, aes(x = .data[[xvar]], y = satisfaction)) +
    geom_point(size = 6, color = point_color) +
    labs(
      x = paste0("\n", xlab),
      y = "Log Odds\n",
      title = paste0("\n\n\n", title, "\n")
    ) +
    theme(
      panel.background = element_rect(fill = "white"),
      axis.text.x = element_blank(),
      axis.text = element_text(size = 20), # Adjust the size of text elements
      axis.title = element_text(size = 20, face = "bold"),
      legend.title = element_text(size = 20, face = "bold"),
      legend.text = element_text(size = 20),
      legend.key.size = unit(2, "lines"),
      plot.title = element_text(size = 32, face = "bold")
    )
}
# Scatter plots for the "original" continuous variables
log_odds_plot(log_odds_Age, "AgeCategory", "turquoise", "Age", "Log Odds by Age")
log_odds_plot(log_odds_Dist, "DistCategory", "blue", "Distance", "Log Odds by Distance")
log_odds_plot(log_odds_DepDelay, "DepDelayCategory", "purple", "Departure Delay", "Log Odds by Departure Delay")
log_odds_plot(log_odds_ArrDelay, "ArrDelayCategory", "pink", "Arrival Delay", "Log Odds by Arrival Delay")
```
```{r log_odds_ratings_visual, fig.width = 15, fig.height = 15, fig.align = 'center', echo=FALSE}
# Reuse the helper for the two aggregated ratings variables
log_odds_plot(log_odds_PreFlight, "PreFlightCategory", "red",
              "Pre-Flight & Wifi Satisfaction Rating", "Log Odds by Pre-Flight & Wifi Ratings")
log_odds_plot(log_odds_InFlight, "InFlightCategory", "orange",
              "In-Flight & Baggage Satisfaction Rating", "Log Odds by In-Flight and Baggage Ratings")
```
Following visual testing, we generated a logit model in order to examine potential differences relative to the prior linear model. Rather than starting with a pared-down variable list, we returned to an expanded variable list to see if there were any distinctions in what the models deemed statistically significant. This proved to be informative; alongside gender and age, flight distance also failed to reach the threshold for statistical significance.
```{r logit_model}
logit_model = glm(satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel + Class + Flight.Distance + Pre_Flight_and_WiFi_Ratings + In_Flight_and_Baggage_Ratings, data = data, family = "binomial")
summary(logit_model)
```
In order to compare this with the linear model, we generated another confusion matrix based on the testing data. In a similar fashion to the linear model, we created a "v2" model removing statistically insignificant inputs. The accuracy results were better than those of the linear model, but only slightly; it isn't clear whether this marginal improvement would hold true given further testing with different survey data. The calculated McFadden pseudo-R^2 falls above 0.5.
```{r logit_model_v2,echo=FALSE}
logit_model_v2 = glm(satisfaction ~ Customer.Type + Type.of.Travel + Class + Pre_Flight_and_WiFi_Ratings + In_Flight_and_Baggage_Ratings, data = data, family = "binomial")
summary(logit_model_v2)
```
```{r logit_model_v2_test,echo=FALSE}
# type = "response" returns probabilities (rather than log odds), matching the 0.5 threshold below
data_test$predicted_probabilities_logit <- predict(logit_model_v2, newdata = data_test, type = "response")
data_test$predicted_outcome_logit <- ifelse(data_test$predicted_probabilities_logit > 0.5, 1, 0)
confusion_matrix <- table(data_test$satisfaction, data_test$predicted_outcome_logit)
print(confusion_matrix)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", round(accuracy, 3)))
logit_model_null <- glm(satisfaction ~ 1, data = data, family = "binomial")
mcFadden <- 1 - logLik(logit_model_v2)/logLik(logit_model_null)
print(paste("McFadden R^2:", round(mcFadden,3)))
```
## Constructing the initial logit model
```{r,max.height = "500px", echo=FALSE}
log_model <- glm(satisfaction ~ Age + Type.of.Travel + Class + Inflight.wifi.service +
Ease.of.Online.booking + Online.boarding + Seat.comfort +
Inflight.entertainment + On.board.service + Leg.room.service +
Baggage.handling + Checkin.service + Inflight.service +
Cleanliness + Arrival.Delay.in.Minutes,
family = binomial(), data = data)
summary(log_model)
log_predictions <- predict(log_model, newdata = data_test, type = "response")
```
#### Observations
Our initial model yields some interesting observations. For one, most variables have p-values less than 0.05, indicating they significantly influence the dependent variable. Positive coefficients (e.g., 'Age', 'Type of Travel') suggest a positive relationship with the outcome, whereas negative coefficients (e.g., 'Ease of Online booking', 'Arrival Delay in Minutes') indicate a negative relationship. Variables with larger coefficients and small standard errors, like 'Online boarding' and 'Type of Travel', may have a more substantial impact on the outcome. The large difference between the null and residual deviance suggests a good model fit. Some variables, like 'Inflight service', do not show statistical significance, implying a weaker or no influence on the dependent variable. Finally, the model seems capable of predicting the outcome effectively, given the significance and size of most coefficients.
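Because logit coefficients are on the log-odds scale, exponentiating them yields odds ratios, which are often easier to communicate (values above 1 raise the odds of satisfaction, values below 1 lower them). A minimal sketch using the `log_model` object above:
```{r odds_ratios}
# Convert log-odds coefficients into odds ratios
exp(coef(log_model))
# Confidence intervals on the odds-ratio scale could be obtained with
# exp(confint(log_model)), though profile-likelihood intervals can be slow here
```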
```{r, echo=FALSE}
# Converting probabilities to binary classification based on a threshold (e.g., 0.5)
log_pred_class <- ifelse(log_predictions > 0.5, 1, 0)
conf_matrix <- table(data_test$satisfaction, log_pred_class)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Rows of conf_matrix are actual values and columns are predictions, so precision
# divides true positives by predicted positives and recall by actual positives
precision <- conf_matrix[2,2] / sum(conf_matrix[,2])
recall <- conf_matrix[2,2] / sum(conf_matrix[2,])
f_measure <- 2 * precision * recall / (precision + recall)
specificity <- conf_matrix[1,1] / sum(conf_matrix[1,])
log_pred_roc <- pROC::roc(data_test$satisfaction, log_predictions)
auc_value <- pROC::auc(log_pred_roc)
# list(accuracy = accuracy, precision = precision, recall = recall,
# f_measure = f_measure, specificity = specificity, AUC = auc_value)
print(conf_matrix)
```
```{r, echo=FALSE}
vif_model <- vif(log_model)
print(vif_model)
```
```{r ROC-LO, echo=FALSE}
plot(log_pred_roc,
main = "ROC Curve for Logistic Regression Model",
col = "#1c61b6",
lwd = 2)
auc(log_pred_roc)
text(0.6, 0.2, paste("AUC =", round(auc(log_pred_roc), 2)), col = "red")
```
The model's performance was evaluated using various metrics:
- **Accuracy**: `r accuracy` (The proportion of true results among the total number of cases)
- **Precision**: `r precision` (The proportion of true positives among all positive predictions)
- **Recall**: `r recall` (The proportion of true positives among all actual positives)
- **F1 Score**: `r f_measure` (The harmonic mean of precision and recall)
- **Specificity**: `r specificity` (The proportion of true negatives among all actual negatives)
The accuracy is quite high, at 87.9%. This means that the model correctly predicts whether a customer is satisfied or not in approximately 88 out of 100 cases. It's a good indicator of overall performance, but it's important to consider other metrics as well, especially if the data set is imbalanced.
Both precision and recall are also high, around 86%. Precision indicates that when the model predicts customer satisfaction, it is correct 85.9% of the time. Recall tells us that the model successfully identifies 86% of actual satisfied customers. These metrics are particularly important in scenarios where the costs of false positives and false negatives are different.
The F-Measure, which balances precision and recall, is also 0.86. This suggests a good balance between precision and recall in the model, which is crucial for a well-rounded predictive performance.
The specificity is 89.4%, indicating that the model is quite good at identifying true negatives - i.e., it correctly identifies customers who are not satisfied.
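As a sanity check on these hand-computed values, caret's `confusionMatrix()` reproduces the same family of metrics from the predictions (a sketch; `caret_cm` is our name, and "1", the satisfied class, is treated as positive):
```{r caret_metrics_check}
# Cross-check the hand-computed metrics with caret's confusionMatrix()
caret_cm <- confusionMatrix(factor(log_pred_class, levels = c(0, 1)),
                            factor(data_test$satisfaction, levels = c(0, 1)),
                            positive = "1")
caret_cm$byClass[c("Precision", "Recall", "F1", "Specificity")]
```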
The AUC value is 0.948, which is very close to 1. This high value indicates that the model has an excellent ability to discriminate between satisfied and unsatisfied customers. It implies that the model has a high true positive rate and a low false positive rate.
Overall, the model exhibits strong predictive capabilities across various metrics, indicating that it is well-tuned for this particular task. However, it's always important to consider the context and the potential impact of misclassifications. Also, examining other aspects like model interpretability, feature importance, and the performance on different segments of the data can provide deeper insights.
VIF results are also generally good, indicating that for most of the model's predictors, multicollinearity is not a significant issue.
The ROC curve displayed is highly indicative of an excellent predictive model, with an AUC (Area Under the Curve) of 0.95, showing exceptional discrimination ability between the positive and negative classes. The curve stays well above the diagonal line of no-discrimination, signaling strong performance.
# Decision Tree
## Data Preparation
### Data Type Conversion
- Certain columns in both training and testing datasets are converted to factors to reflect their ordinal nature.
**Column Datatype Changes - Testing Data**: Conversion of certain columns to factors based on their ordinal nature.
```{r Data Prep-1}
# Ratings columns are ordinal, so convert them to factors (testing data)
ordinal_cols <- c("Inflight.wifi.service", "Departure.Arrival.time.convenient",
                  "Ease.of.Online.booking", "Gate.location", "Food.and.drink",
                  "Online.boarding", "Seat.comfort", "Inflight.entertainment",
                  "On.board.service", "Leg.room.service", "Baggage.handling",
                  "Checkin.service", "Inflight.service", "Cleanliness")
data_test[ordinal_cols] <- lapply(data_test[ordinal_cols], as.factor)
```
**Column Datatype Changes - Training Data**: Similar data type conversions for training data.
```{r Data-Prep-2}
# Column datatype changes - training data: as the columns are ordinal,
# it is better to convert them to factors
data[ordinal_cols] <- lapply(data[ordinal_cols], as.factor)
```
## Decision Tree Model Building
1. **Initial Model Building**: A decision tree (`tree`) is constructed using various predictors such as customer demographics, service ratings, and flight details.
```{r Init Model, echo=FALSE}
# Classification tree; minbucket = 25 requires at least 25 observations in each terminal node
tree = rpart(satisfaction ~ Gender + Customer.Type + Age +
Type.of.Travel + Class + Flight.Distance + Inflight.wifi.service +
Departure.Arrival.time.convenient + Ease.of.Online.booking +
Gate.location + Food.and.drink + Online.boarding + Seat.comfort +
Inflight.entertainment + On.board.service + Leg.room.service +
Baggage.handling + Checkin.service + Inflight.service +
Cleanliness + Departure.Delay.in.Minutes + Arrival.Delay.in.Minutes ,
data = data, method = 'class', minbucket=25)
```