-
Notifications
You must be signed in to change notification settings - Fork 0
/
EDA_titanic.R
81 lines (56 loc) · 2.21 KB
/
EDA_titanic.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
library(tidyverse)
library(randomForest)
library(ggthemes)
######################################################
# Reading in the data
######################################################
# Load the training and test sets. Strings are kept as plain character
# vectors (not factors) so text columns such as Name can be parsed below.
# FALSE is spelled out because the alias F is an ordinary, reassignable variable.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)

# Quick structural overview of the training data
glimpse(train)
summary(train)
# Total number of NA cells. NOTE: with read.csv's default na.strings, empty
# fields in character columns are read as "" rather than NA, so they are not
# counted here.
sum(is.na(train))

# Bind together the two datasets, tagging every row with its origin
full <- bind_rows(
  train %>% mutate(dset = "train"),
  test %>% mutate(dset = "test")
)
full <- full %>% mutate(dset = factor(dset))
str(full)
######################################################
# Feature engineering
######################################################
# Derive the passenger's title from Name: the regex strips the leading
# "Surname, " part and everything from the period onwards
full[["Title"]] <- gsub('(.*, )|(\\..*)', '', full[["Name"]])

# Show title counts by sex
table(full$Sex, full$Title)

# Titles with very low cell counts, to be collapsed into a single level
rare_title <- c(
  'Dona', 'Lady', 'the Countess', 'Capt', 'Col', 'Don',
  'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'
)

# Fold the Mlle / Ms / Mme variants into Miss / Mrs, then relabel every rare
# title. %in% never yields NA, so missing titles are left untouched.
full$Title <- replace(full$Title, full$Title %in% c('Mlle', 'Ms'), 'Miss')
full$Title <- replace(full$Title, full$Title %in% 'Mme', 'Mrs')
full$Title <- replace(full$Title, full$Title %in% rare_title, 'Rare Title')

# Title counts by sex after consolidation
table(full$Sex, full$Title)
# Finally, grab surname from passenger name: the text before the first comma
# or period. strsplit() is vectorised, so all names are split in one call and
# the first piece of each is kept. vapply() guarantees a plain, unnamed
# character vector; sapply() would return a list for zero rows and would
# attach the full passenger name to every element as a name.
full$Surname <- vapply(
  strsplit(full$Name, split = '[,.]'),
  function(parts) parts[1],
  character(1)
)

# Check how many distinct surnames we have
nlevels(factor(full$Surname))

# Create a family size variable including the passenger themselves
# (siblings/spouses + parents/children + 1)
full$Fsize <- full$SibSp + full$Parch + 1
table(full$Fsize)

# Create a family variable: surname and family size joined as "Surname_Size"
full$Family <- paste(full$Surname, full$Fsize, sep = '_')
# Family size vs. survival: dodged bar chart of counts per family size,
# coloured by Survived, using only the rows tagged dset == "train"
full %>%
  filter(dset == "train") %>%
  ggplot(aes(x = Fsize, fill = factor(Survived))) +
  geom_bar(stat = 'count', position = 'dodge') +
  scale_x_continuous(breaks = 1:11) +
  labs(x = 'Family Size', y = 'Count') +
  theme_few()
# Peek at the first 30 Cabin values
head(full[["Cabin"]], n = 30)
# TODO: Cabin handling is unfinished. For now, split the second Cabin value
# into its individual characters to inspect its structure.
strsplit(full[["Cabin"]][2], split = NULL)
######################################################
# Missing values imputation
######################################################
######################################################
# Modelling
######################################################