The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.
One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.
In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.
Data came from Kaggle.com as a part of the online competition.
Load libraries
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(dplyr)
library(stringr) #strings
library(zoo) #na.aggregrate
library(modeest) #mode
library(ROCR) #ROC curve
library(caret)
library(class)
library(gmodels)
library(randomForest)
Import the data frames that Kaggle provided.
#import data frames
path <- "C:/Users/Cat/Google Drive/Data Analysis/Kaggle Competitions/Titanic - Machine Learning/"
df_train <- read_csv(file=str_c(path, "train.csv"))
df_test <- read_csv(file=str_c(path, "test.csv"))
Review training data to understand it.
Find missing values.
#review data frame
df_train %>% glimpse()
## Observations: 891
## Variables: 12
## $ PassengerId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,...
## $ Pclass <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bra...
## $ Sex <chr> "male", "female", "female", "female", "male", "mal...
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Parch <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "1138...
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, ...
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", ...
df_test %>% glimpse()
## Observations: 418
## Variables: 11
## $ PassengerId <dbl> 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, ...
## $ Pclass <dbl> 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 2, 1, 2, 2,...
## $ Name <chr> "Kelly, Mr. James", "Wilkes, Mrs. James (Ellen Nee...
## $ Sex <chr> "male", "female", "male", "male", "female", "male"...
## $ Age <dbl> 34.5, 47.0, 62.0, 27.0, 22.0, 14.0, 30.0, 26.0, 18...
## $ SibSp <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 0,...
## $ Parch <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Ticket <chr> "330911", "363272", "240276", "315154", "3101298",...
## $ Fare <dbl> 7.8292, 7.0000, 9.6875, 8.6625, 12.2875, 9.2250, 7...
## $ Cabin <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "B...
## $ Embarked <chr> "Q", "S", "Q", "S", "S", "S", "Q", "S", "C", "S", ...
df_train$Survived %>% table()
## .
## 0 1
## 549 342
sapply(df_train, function(x) sum(is.na(x)))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
sapply(df_train, function(x) length(unique(x)))
## PassengerId Survived Pclass Name Sex Age
## 891 2 3 891 2 89
## SibSp Parch Ticket Fare Cabin Embarked
## 7 7 681 248 148 4
Amelia::missmap(df_train, main = "Missing values vs observed")
Combine data for cleaning so that the same manipulations are applied to everything.
#combine data for cleaning
df <- bind_rows(df_train, df_test)
df %>% glimpse()
## Observations: 1,309
## Variables: 12
## $ PassengerId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,...
## $ Pclass <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bra...
## $ Sex <chr> "male", "female", "female", "female", "male", "mal...
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Parch <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "1138...
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, ...
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", ...
Perform data cleaning:
#clean data
df <- df %>% mutate_if(is.character, ~replace(., is.na(.), "NA"))
df <- df %>% mutate(PassengerId = as.integer(PassengerId),
Survived = as.logical(Survived),
Pclass = as.ordered(Pclass),
Title = str_extract(Name, "(?<=,\\s)[:alpha:]+\\s?[:alpha:]*(?=.)") %>% as.factor(),
Group = case_when(Title %in% c("Capt","Col","Major","Dr","Mr","Rev") ~ "Officer",
Title %in% c("the Countess","Lady","Sir","Don","Dona","Jonkheer") ~ "Royalty",
Title %in% c("Miss","Mlle","Ms") ~ "Miss",
Title %in% c("Mme","Mrs") ~ "Mrs",
Title == "Master" ~ "Master") %>% as.factor(),
Sex = as.factor(Sex),
Family = as.ordered(SibSp + Parch),
SibSp = as.ordered(SibSp),
ParCh = as.ordered(Parch),
Fare = na.aggregate(Fare,Pclass),
Cabin = as.factor(Cabin),
Embarked = replace(Embarked, is.na(Embarked), mfv(Embarked, na.rm=T)) %>% as.factor()) %>%
select(-Name,-Parch,-Ticket)
sapply(df, function(x) sum(is.na(x)))
## PassengerId Survived Pclass Sex Age SibSp
## 0 418 0 0 263 0
## Fare Cabin Embarked Title Group Family
## 0 0 0 0 0 0
## ParCh
## 0
df %>% glimpse()
## Observations: 1,309
## Variables: 13
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALS...
## $ Pclass <ord> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Sex <fct> male, female, female, female, male, male, male, ma...
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp <ord> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin <fct> NA, C85, NA, C123, NA, NA, E46, NA, NA, NA, G6, C1...
## $ Embarked <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q,...
## $ Title <fct> Mr, Mrs, Miss, Mrs, Mr, Mr, Mr, Master, Mrs, Mrs, ...
## $ Group <fct> Officer, Mrs, Miss, Mrs, Officer, Officer, Officer...
## $ Family <ord> 1, 1, 0, 1, 0, 0, 0, 4, 2, 1, 2, 0, 0, 6, 0, 0, 5,...
## $ ParCh <ord> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
Determine which features can predict age in order to provide better estimates for missing values.
##create a model to see which features can estimate age
age1 <- lm(Age ~ Pclass+Group, data=df)
age1 %>% summary()
##
## Call:
## lm(formula = Age ~ Pclass + Group, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.137 -7.866 -0.866 6.477 45.134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9721 1.5535 5.776 1.01e-08 ***
## Pclass.L -9.0298 0.6006 -15.035 < 2e-16 ***
## Pclass.Q 2.3736 0.6589 3.603 0.00033 ***
## GroupMiss 13.8110 1.7196 8.032 2.59e-15 ***
## GroupMrs 27.1856 1.7786 15.285 < 2e-16 ***
## GroupOfficer 25.3102 1.6029 15.790 < 2e-16 ***
## GroupRoyalty 24.8406 4.8556 5.116 3.72e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.15 on 1039 degrees of freedom
## (263 observations deleted due to missingness)
## Multiple R-squared: 0.4053, Adjusted R-squared: 0.4019
## F-statistic: 118 on 6 and 1039 DF, p-value: < 2.2e-16
age1 %>% anova(test="Chisq")
## Analysis of Variance Table
##
## Response: Age
## Df Sum Sq Mean Sq F value Pr(>F)
## Pclass 2 37339 18669.7 150.26 < 2.2e-16 ***
## Group 4 50660 12664.9 101.93 < 2.2e-16 ***
## Residuals 1039 129098 124.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df %>% group_by(Pclass,Group) %>% summarize(Age = Age %>% mean(na.rm=TRUE))
## # A tibble: 13 x 3
## # Groups: Pclass [3]
## Pclass Group Age
## <ord> <fct> <dbl>
## 1 1 Master 6.98
## 2 1 Miss 30.1
## 3 1 Mrs 42.9
## 4 1 Officer 42.2
## 5 1 Royalty 41.2
## 6 2 Master 2.76
## 7 2 Miss 20.9
## 8 2 Mrs 33.5
## 9 2 Officer 32.9
## 10 3 Master 6.09
## 11 3 Miss 17.4
## 12 3 Mrs 32.3
## 13 3 Officer 28.3
Add in missing values for Age using mean, grouped by Pclass and Group.
Create a more general feature for Child.
#estimate age and create child feature
df <- df %>% mutate(Age = na.aggregate(Age,c(Pclass,Group)),
Child = case_when(Age < 18 ~ 1,
Age >= 18 ~ 0) %>% as.logical())
## Warning in split.default(x, g): data length is not a multiple of split
## variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is
## not a multiple of split variable
sapply(df, function(x) sum(is.na(x)))
## PassengerId Survived Pclass Sex Age SibSp
## 0 418 0 0 0 0
## Fare Cabin Embarked Title Group Family
## 0 0 0 0 0 0
## ParCh Child
## 0 0
Perform more manipulations on Cabin to figure out if the letter or number has any significance.
#df_cabin
df_cabin <- str_split(df$Cabin, " ")
df_cabin <- data.frame(PassengerId = rep(df$PassengerId, sapply(df_cabin, length)),
Cabin = unlist(df_cabin))
df_cabin <- df_cabin %>% mutate(CabinLtr = str_extract(Cabin, "^[:alpha:]*"),
CabinLtr = as.factor(CabinLtr)) %>%
select(PassengerId, CabinLtr)
df_cabin %>% glimpse() %>% summary()
## Observations: 1,370
## Variables: 2
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ CabinLtr <fct> NA, C, NA, C, NA, NA, E, NA, NA, NA, G, C, NA, NA,...
## PassengerId CabinLtr
## Min. : 1.0 NA :1014
## 1st Qu.: 329.2 C : 114
## Median : 664.5 B : 96
## Mean : 657.9 D : 48
## 3rd Qu.: 981.8 E : 45
## Max. :1309.0 A : 22
## (Other): 31
Put Cabin dataframe back into main dataframe.
#join dataframes back together
df <- df %>% inner_join(df_cabin, by="PassengerId") %>% select(-Cabin)
df %>% glimpse()
## Observations: 1,370
## Variables: 14
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALS...
## $ Pclass <ord> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Sex <fct> male, female, female, female, male, male, male, ma...
## $ Age <dbl> 22.00000, 38.00000, 26.00000, 35.00000, 35.00000, ...
## $ SibSp <ord> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Embarked <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q,...
## $ Title <fct> Mr, Mrs, Miss, Mrs, Mr, Mr, Mr, Master, Mrs, Mrs, ...
## $ Group <fct> Officer, Mrs, Miss, Mrs, Officer, Officer, Officer...
## $ Family <ord> 1, 1, 0, 1, 0, 0, 0, 4, 2, 1, 2, 0, 0, 6, 0, 0, 5,...
## $ ParCh <ord> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Child <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, T...
## $ CabinLtr <fct> NA, C, NA, C, NA, NA, E, NA, NA, NA, G, C, NA, NA,...
Remove any duplicate rows that could skew the model.
# Remove duplicate rows
df <- df %>% distinct()
Create variables to use in plots in order to evaluate potential trends in the data.
#create variables to use in functions
y_col <- "Survived"
x_cols <- df %>% names()
x_cols <- x_cols[!str_detect(x_cols, y_col)]
Use boxplots to look for trends with numerical features.
There are noticeable trends with both Age and Fare.
#look for basic trends (boxplot)
gg_boxplot <- function(x_col, data){
if(is.numeric(data[[x_col]])){
plt <- data %>%
ggplot(aes_string(x='Survived', y=x_col)) +
geom_boxplot()+
labs(title=str_c("Survived vs. ", x_col))
plt %>% print()
}
}
x_cols %>% walk(gg_boxplot, df)
Use boxplots to look for trends with categorical features.
There are noticeable trends with Pclass, Sex, SibSp, Embarked, Title, Group, Family, ParCh, and Child.
#look for basic trends (scatter plot)
gg_scatter <- function(data, x_col, y_col) {
if(!is.numeric(data[[x_col]])){
plt <- data %>% ggplot(mapping=aes_string(x_col, y_col))+
geom_jitter(alpha=0.5)+
coord_flip()+
labs(title=str_c("Survived: ", x_col, " by ", y_col), subtitle="Points jittered and alpha blended")
plt %>% print()
}
}
x_cols %>% walk(gg_scatter, data=df, y_col=y_col)
Normalize the data frame so that model is not skewed by different types of measurements.
#normalize dataframes
normalize <- function(x)(x - mean(x, na.rm=T))/sd(x, na.rm=T)
df_norm <- df %>% mutate_if(is.double, normalize)
df_norm %>% glimpse()
## Observations: 1,316
## Variables: 14
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALS...
## $ Pclass <ord> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Sex <fct> male, female, female, female, male, male, male, ma...
## $ Age <dbl> -0.55963667, 0.66141600, -0.25437350, 0.43246862, ...
## $ SibSp <ord> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Fare <dbl> -0.50169283, 0.73844524, -0.48862006, 0.38628780, ...
## $ Embarked <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q,...
## $ Title <fct> Mr, Mrs, Miss, Mrs, Mr, Mr, Mr, Master, Mrs, Mrs, ...
## $ Group <fct> Officer, Mrs, Miss, Mrs, Officer, Officer, Officer...
## $ Family <ord> 1, 1, 0, 1, 0, 0, 0, 4, 2, 1, 2, 0, 0, 6, 0, 0, 5,...
## $ ParCh <ord> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Child <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, T...
## $ CabinLtr <fct> NA, C, NA, C, NA, NA, E, NA, NA, NA, G, C, NA, NA,...
Recreate original testing dataframe as well as a training and validation set.
#break training set into training, validation and test sets
df_test_norm <- df_norm %>% filter(is.na(Survived))
set.seed(123)
df_train_norm <- df_norm %>% filter(!is.na(Survived)) %>% sample_frac(0.8)
df_val_norm <- df_norm %>% filter(!is.na(Survived)) %>% setdiff(df_train_norm)
df_test_norm %>% glimpse()
## Observations: 421
## Variables: 14
## $ PassengerId <int> 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, ...
## $ Survived <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Pclass <ord> 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 2, 1, 2, 2,...
## $ Sex <fct> male, female, male, male, female, male, female, ma...
## $ Age <dbl> 0.39431073, 1.34825812, 2.49299500, -0.17805771, -...
## $ SibSp <ord> 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 0,...
## $ Fare <dbl> -0.49047542, -0.50653461, -0.45448558, -0.47433683...
## $ Embarked <fct> Q, S, Q, S, S, S, Q, S, C, S, S, S, S, S, S, C, Q,...
## $ Title <fct> Mr, Mrs, Mr, Mr, Mrs, Mr, Miss, Mr, Mrs, Mr, Mr, M...
## $ Group <fct> Officer, Mrs, Officer, Officer, Mrs, Officer, Miss...
## $ Family <ord> 0, 1, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 1, 1, 1, 1, 0,...
## $ ParCh <ord> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Child <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FA...
## $ CabinLtr <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, B,...
df_train_norm %>% glimpse()
## Observations: 716
## Variables: 14
## $ PassengerId <int> 256, 702, 364, 784, 834, 41, 468, 789, 488, 403, 8...
## $ Survived <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRU...
## $ Pclass <ord> 3, 1, 3, 3, 3, 3, 1, 3, 1, 3, 1, 2, 2, 3, 3, 3, 3,...
## $ Sex <fct> female, male, male, male, male, female, male, male...
## $ Age <dbl> -0.02542613, 0.43246862, 0.43246862, -0.34470337, ...
## $ SibSp <ord> 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 4, 1,...
## $ Fare <dbl> -0.346837533, -0.132992041, -0.505566251, -0.18794...
## $ Embarked <fct> C, S, S, S, S, S, S, S, C, S, C, S, S, S, S, Q, Q,...
## $ Title <fct> Mrs, Mr, Mr, Mr, Mr, Mrs, Mr, Master, Mr, Miss, Mi...
## $ Group <fct> Mrs, Officer, Officer, Officer, Officer, Mrs, Offi...
## $ Family <ord> 2, 0, 0, 3, 0, 1, 0, 3, 0, 1, 0, 0, 0, 0, 0, 5, 1,...
## $ ParCh <ord> 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ Child <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, T...
## $ CabinLtr <fct> NA, E, NA, NA, NA, NA, NA, NA, B, NA, NA, NA, NA, ...
df_val_norm %>% glimpse()
## Observations: 179
## Variables: 14
## $ PassengerId <int> 2, 3, 11, 17, 21, 25, 30, 31, 38, 42, 52, 54, 56, ...
## $ Survived <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ Pclass <ord> 1, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 3, 2,...
## $ Sex <fct> female, female, female, male, male, female, male, ...
## $ Age <dbl> 0.66141600, -0.25437350, -1.93332092, -2.08595250,...
## $ SibSp <ord> 1, 0, 1, 4, 0, 3, 0, 0, 0, 1, 0, 1, 0, 0, 0, 4, 0,...
## $ Fare <dbl> 0.73844524, -0.48862006, -0.31867392, -0.07803795,...
## $ Embarked <fct> C, S, S, Q, S, S, S, C, S, S, S, S, S, S, C, S, S,...
## $ Title <fct> Mrs, Miss, Miss, Master, Mr, Miss, Mr, Don, Mr, Mr...
## $ Group <fct> Mrs, Miss, Miss, Master, Officer, Miss, Officer, R...
## $ Family <ord> 1, 0, 2, 5, 0, 4, 0, 0, 0, 1, 0, 1, 0, 0, 0, 6, 0,...
## $ ParCh <ord> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,...
## $ Child <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALS...
## $ CabinLtr <fct> C, NA, G, NA, NA, NA, NA, NA, NA, NA, NA, NA, C, N...
Time to build some models.
The models below are statistically significant. They do not show the process of elimating features that were not statistically significant for determining the label.
The best model will be determined later.
#build statistical models
#mod1
mod1 <- glm(Survived ~ Group+Pclass+Embarked, family=binomial(), data=df_train_norm)
mod1 %>% summary()
##
## Call:
## glm(formula = Survived ~ Group + Pclass + Embarked, family = binomial(),
## data = df_train_norm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3339 -0.6269 -0.3609 0.5805 2.3511
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.39029 0.44247 3.142 0.00168 **
## GroupMiss 0.20742 0.42454 0.489 0.62514
## GroupMrs 0.82312 0.46665 1.764 0.07775 .
## GroupOfficer -2.39966 0.41197 -5.825 5.72e-09 ***
## GroupRoyalty -1.00648 1.24515 -0.808 0.41891
## Pclass.L -1.50732 0.18999 -7.933 2.13e-15 ***
## Pclass.Q -0.08629 0.20371 -0.424 0.67184
## EmbarkedNA 11.65444 617.33076 0.019 0.98494
## EmbarkedQ 0.17578 0.40312 0.436 0.66281
## EmbarkedS -0.58830 0.26137 -2.251 0.02440 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 953.75 on 715 degrees of freedom
## Residual deviance: 625.83 on 706 degrees of freedom
## AIC: 645.83
##
## Number of Fisher Scoring iterations: 13
anova(mod1, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 715 953.75
## Group 4 238.277 711 715.47 < 2e-16 ***
## Pclass 2 80.704 709 634.77 < 2e-16 ***
## Embarked 3 8.939 706 625.83 0.03012 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#mod2
mod2 <- glm(Survived ~ Pclass+Age+Group -1, family=binomial(), data=df_train_norm)
mod2 %>% summary()
##
## Call:
## glm(formula = Survived ~ Pclass + Age + Group - 1, family = binomial(),
## data = df_train_norm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5581 -0.6440 -0.4078 0.5945 2.3992
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## Pclass1 1.8267 0.4556 4.009 6.09e-05 ***
## Pclass2 0.5329 0.4711 1.131 0.2580
## Pclass3 -0.5648 0.4445 -1.271 0.2038
## Age -0.2518 0.1282 -1.964 0.0495 *
## GroupMiss 0.5892 0.4466 1.320 0.1870
## GroupMrs 1.3232 0.5320 2.487 0.0129 *
## GroupOfficer -1.9543 0.4636 -4.215 2.50e-05 ***
## GroupRoyalty -0.4804 1.2593 -0.382 0.7028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 992.59 on 716 degrees of freedom
## Residual deviance: 630.79 on 708 degrees of freedom
## AIC: 646.79
##
## Number of Fisher Scoring iterations: 5
anova(mod2, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 716 992.59
## Pclass 3 125.011 713 867.58 < 2.2e-16 ***
## Age 1 23.638 712 843.94 1.162e-06 ***
## Group 4 213.146 708 630.79 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#mod3
mod3 <- glm(Survived ~ Pclass+Sex+Age, family=binomial(), data=df_train_norm)
mod3 %>% summary()
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age, family = binomial(),
## data = df_train_norm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6659 -0.6460 -0.4510 0.6444 2.4025
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4526 0.1699 8.551 < 2e-16 ***
## Pclass.L -1.7074 0.1985 -8.603 < 2e-16 ***
## Pclass.Q 0.0245 0.1910 0.128 0.897927
## Sexmale -2.5939 0.2080 -12.472 < 2e-16 ***
## Age -0.4097 0.1114 -3.679 0.000235 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 953.75 on 715 degrees of freedom
## Residual deviance: 652.93 on 711 degrees of freedom
## AIC: 662.93
##
## Number of Fisher Scoring iterations: 4
anova(mod3, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 715 953.75
## Pclass 2 86.172 713 867.58 < 2.2e-16 ***
## Sex 1 200.334 712 667.24 < 2.2e-16 ***
## Age 1 14.313 711 652.93 0.0001548 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#mod4
mod4 <- glm(Survived ~ Pclass+Age+Embarked+Group, family=binomial(), data=df_train_norm)
mod4 %>% summary()
##
## Call:
## glm(formula = Survived ~ Pclass + Age + Embarked + Group, family = binomial(),
## data = df_train_norm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4914 -0.6379 -0.3715 0.6044 2.4851
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.00441 0.48544 2.069 0.0385 *
## Pclass.L -1.70017 0.21616 -7.865 3.68e-15 ***
## Pclass.Q -0.04169 0.20637 -0.202 0.8399
## Age -0.26122 0.12938 -2.019 0.0435 *
## EmbarkedNA 11.89108 621.42681 0.019 0.9847
## EmbarkedQ 0.27161 0.40551 0.670 0.5030
## EmbarkedS -0.55441 0.26381 -2.102 0.0356 *
## GroupMiss 0.47689 0.45112 1.057 0.2905
## GroupMrs 1.33616 0.53828 2.482 0.0131 *
## GroupOfficer -1.95975 0.46753 -4.192 2.77e-05 ***
## GroupRoyalty -0.55550 1.26100 -0.441 0.6596
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 953.75 on 715 degrees of freedom
## Residual deviance: 621.63 on 705 degrees of freedom
## AIC: 643.63
##
## Number of Fisher Scoring iterations: 13
anova(mod4, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 715 953.75
## Pclass 2 86.172 713 867.58 < 2.2e-16 ***
## Age 1 23.638 712 843.94 1.162e-06 ***
## Embarked 3 20.828 709 823.11 0.0001143 ***
## Group 4 201.484 705 621.63 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
View models side by side to compare them to each other.
mod1 %>% broom::glance()
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 954. 715 -313. 646. 692. 626. 706
mod2 %>% broom::glance()
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 993. 716 -315. 647. 683. 631. 708
mod3 %>% broom::glance()
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 954. 715 -326. 663. 686. 653. 711
mod4 %>% broom::glance()
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 954. 715 -311. 644. 694. 622. 705
Add predictions into dataframe.
Perform a logistic transformation on the score.
#score/predict results
df_val_norm$score1 <- predict(mod1, newdata = df_val_norm)
df_val_norm$score2 <- predict(mod2, newdata = df_val_norm)
df_val_norm$score3 <- predict(mod3, newdata = df_val_norm)
df_val_norm$score4 <- predict(mod4, newdata = df_val_norm)
df_val_norm = df_val_norm %>% mutate(score1_prob = exp(score1)/(1 + exp(score1)),
score2_prob = exp(score2)/(1 + exp(score2)),
score3_prob = exp(score3)/(1 + exp(score3)),
score4_prob = exp(score4)/(1 + exp(score4)))
df_val_norm %>% glimpse()
## Observations: 179
## Variables: 22
## $ PassengerId <int> 2, 3, 11, 17, 21, 25, 30, 31, 38, 42, 52, 54, 56, ...
## $ Survived <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ Pclass <ord> 1, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 3, 2,...
## $ Sex <fct> female, female, female, male, male, female, male, ...
## $ Age <dbl> 0.66141600, -0.25437350, -1.93332092, -2.08595250,...
## $ SibSp <ord> 1, 0, 1, 4, 0, 3, 0, 0, 0, 1, 0, 1, 0, 0, 0, 4, 0,...
## $ Fare <dbl> 0.73844524, -0.48862006, -0.31867392, -0.07803795,...
## $ Embarked <fct> C, S, S, Q, S, S, S, C, S, S, S, S, S, S, C, S, S,...
## $ Title <fct> Mrs, Miss, Miss, Master, Mr, Miss, Mr, Don, Mr, Mr...
## $ Group <fct> Mrs, Miss, Miss, Master, Officer, Miss, Officer, R...
## $ Family <ord> 1, 0, 2, 5, 0, 4, 0, 0, 0, 1, 0, 1, 0, 0, 0, 6, 0,...
## $ ParCh <ord> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,...
## $ Child <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALS...
## $ CabinLtr <fct> C, NA, G, NA, NA, NA, NA, NA, NA, NA, NA, NA, C, N...
## $ score1 <dbl> 3.24401840, -0.09164284, -0.09164284, 0.46500478, ...
## $ score2 <dbl> 2.98336574, 0.08845412, 0.51124572, -0.03955233, -...
## $ score3 <dbl> 2.3989170, 0.3594953, 1.0473986, -1.4839915, -1.33...
## $ score4 <dbl> 3.35298133, -0.22588905, 0.21267932, 0.60168222, -...
## $ score1_prob <dbl> 0.9624576, 0.4771053, 0.4771053, 0.6142008, 0.1784...
## $ score2_prob <dbl> 0.95181696, 0.52209912, 0.62509845, 0.49011321, 0....
## $ score3_prob <dbl> 0.91674468, 0.58891825, 0.74027505, 0.18482529, 0....
## $ score4_prob <dbl> 0.96620233, 0.44376665, 0.55297031, 0.64604108, 0....
View how the transformed scores are classified using boxplots.
#boxplots for scores
score_cols <- c("score1_prob","score2_prob","score3_prob","score4_prob")
score_cols %>% walk(gg_boxplot, df_val_norm)
View how the transformed scores are classified using dotplots.
# dotplots for scores
gg_dotplot <- function(x_col, data, bins = 60){
if(is.numeric(data[[x_col]])){
binwidth <- (max(data[,x_col]) - min(data[,x_col])) / bins
plt <- ggplot(data, aes_string(x_col)) +
geom_dotplot(dotsize = 0.5, method = "histodot", binwidth = binwidth) +
facet_wrap(~Survived)+
labs(title=str_c("Survived vs. ", x_col))
plt %>% print()
}
}
score_cols %>% walk(gg_dotplot, df_val_norm)
Create functions to find optimal classification thresholds for the models.
#threshold functions
threshs = seq(0.35, 0.70, by=0.05)
test_threshold1 <- function(thresh){
df_val_norm <- df_val_norm %>% mutate(score_pred1 = if_else(score1_prob > thresh, TRUE, FALSE) %>% as.factor())
cat('For mod1 threshold of ', thresh, ' performance is: \n')
cm <- confusionMatrix(data=df_val_norm$score_pred1, reference = (df_val_norm$Survived %>% as.factor()), mode = "prec_recall", positive="TRUE")
print((cm$overall[c('Accuracy','Kappa')]))
cat('\n')
print(cm$byClass[c('Specificity','Sensitivity','Balanced Accuracy','F1')])
cat('\n')
}
test_threshold2 <- function(thresh){
df_val_norm <- df_val_norm %>% mutate(score_pred2 = if_else(score2_prob > thresh, TRUE, FALSE) %>% as.factor())
cat('For mod2 threshold of ', thresh, ' performance is: \n')
cm <- confusionMatrix(data=df_val_norm$score_pred2, reference = (df_val_norm$Survived %>% as.factor()), mode = "prec_recall", positive="TRUE")
print((cm$overall[c('Accuracy','Kappa')]))
cat('\n')
print(cm$byClass[c('Specificity','Sensitivity','Balanced Accuracy','F1')])
cat('\n')
}
test_threshold3 <- function(thresh){
df_val_norm <- df_val_norm %>% mutate(score_pred3 = if_else(score3_prob > thresh, TRUE, FALSE) %>% as.factor())
cat('For mod3 threshold of ', thresh, ' performance is: \n')
cm <- confusionMatrix(data=df_val_norm$score_pred3, reference = (df_val_norm$Survived %>% as.factor()), mode = "prec_recall", positive="TRUE")
print((cm$overall[c('Accuracy','Kappa')]))
cat('\n')
print(cm$byClass[c('Specificity','Sensitivity','Balanced Accuracy','F1')])
cat('\n')
}
test_threshold4 <- function(thresh){
df_val_norm <- df_val_norm %>% mutate(score_pred4 = if_else(score4_prob > thresh, TRUE, FALSE) %>% as.factor())
cat('For mod4 threshold of ', thresh, ' performance is: \n')
cm <- confusionMatrix(data=df_val_norm$score_pred4, reference = (df_val_norm$Survived %>% as.factor()), mode = "prec_recall", positive="TRUE")
print((cm$overall[c('Accuracy','Kappa')]))
cat('\n')
print(cm$byClass[c('Specificity','Sensitivity','Balanced Accuracy','F1')])
cat('\n')
}
mod1 performs best at 0.55 with an accuracy of 0.8100559.
mod2 performs best at 0.55 and 0.70 with an accuracy of 0.8100559.
mod3 performs best at 0.60 and 0.65 with an accuracy of 0.8268156.
mod4 performs best at 0.65 with an accuracy of 0.8100559.
threshs %>% walk(test_threshold1)
## For mod1 threshold of 0.35 performance is:
## Accuracy Kappa
## 0.7374302 0.4932233
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.6306306 0.9117647 0.7711977 0.7251462
##
## For mod1 threshold of 0.4 performance is:
## Accuracy Kappa
## 0.7877095 0.5713926
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7477477 0.8529412 0.8003445 0.7532468
##
## For mod1 threshold of 0.45 performance is:
## Accuracy Kappa
## 0.7877095 0.5666964
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7657658 0.8235294 0.7946476 0.7466667
##
## For mod1 threshold of 0.5 performance is:
## Accuracy Kappa
## 0.7932961 0.5625206
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8288288 0.7352941 0.7820615 0.7299270
##
## For mod1 threshold of 0.55 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.8044693 0.5740703
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6764706 0.7796767 0.7244094
##
## For mod1 threshold of 0.65 performance is:
## Accuracy Kappa
## 0.7932961 0.5333615
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.5735294 0.7507287 0.6782609
##
## For mod1 threshold of 0.7 performance is:
## Accuracy Kappa
## 0.7988827 0.5389898
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9549550 0.5441176 0.7495363 0.6727273
threshs %>% walk(test_threshold2)
## For mod2 threshold of 0.35 performance is:
## Accuracy Kappa
## 0.7541899 0.5193458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.6666667 0.8970588 0.7818627 0.7349398
##
## For mod2 threshold of 0.4 performance is:
## Accuracy Kappa
## 0.7765363 0.5560516
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7117117 0.8823529 0.7970323 0.7500000
##
## For mod2 threshold of 0.45 performance is:
## Accuracy Kappa
## 0.7821229 0.5565085
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7567568 0.8235294 0.7901431 0.7417219
##
## For mod2 threshold of 0.5 performance is:
## Accuracy Kappa
## 0.7877095 0.5569884
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8018018 0.7647059 0.7832538 0.7323944
##
## For mod2 threshold of 0.55 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod2 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.7932961 0.5444039
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8918919 0.6323529 0.7621224 0.6991870
##
## For mod2 threshold of 0.65 performance is:
## Accuracy Kappa
## 0.7932961 0.5333615
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.5735294 0.7507287 0.6782609
##
## For mod2 threshold of 0.7 performance is:
## Accuracy Kappa
## 0.8100559 0.5646015
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9639640 0.5588235 0.7613937 0.6909091
threshs %>% walk(test_threshold3)
## For mod3 threshold of 0.35 performance is:
## Accuracy Kappa
## 0.7541899 0.5116567
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.6936937 0.8529412 0.7733174 0.7250000
##
## For mod3 threshold of 0.4 performance is:
## Accuracy Kappa
## 0.7709497 0.5362987
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7387387 0.8235294 0.7811341 0.7320261
##
## For mod3 threshold of 0.45 performance is:
## Accuracy Kappa
## 0.7932961 0.5722405
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7927928 0.7941176 0.7934552 0.7448276
##
## For mod3 threshold of 0.5 performance is:
## Accuracy Kappa
## 0.7877095 0.5569884
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8018018 0.7647059 0.7832538 0.7323944
##
## For mod3 threshold of 0.55 performance is:
## Accuracy Kappa
## 0.7821229 0.5388731
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8198198 0.7205882 0.7702040 0.7153285
##
## For mod3 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.8268156 0.6182844
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9189189 0.6764706 0.7976948 0.7479675
##
## For mod3 threshold of 0.65 performance is:
## Accuracy Kappa
## 0.8268156 0.6113874
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9459459 0.6323529 0.7891494 0.7350427
##
## For mod3 threshold of 0.7 performance is:
## Accuracy Kappa
## 0.8044693 0.5558942
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9459459 0.5735294 0.7597377 0.6902655
threshs %>% walk(test_threshold4)
## For mod4 threshold of 0.35 performance is:
## Accuracy Kappa
## 0.7709497 0.5485637
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.6936937 0.8970588 0.7953763 0.7484663
##
## For mod4 threshold of 0.4 performance is:
## Accuracy Kappa
## 0.7821229 0.5636602
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7297297 0.8676471 0.7986884 0.7515924
##
## For mod4 threshold of 0.45 performance is:
## Accuracy Kappa
## 0.7877095 0.5594560
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.7927928 0.7794118 0.7861023 0.7361111
##
## For mod4 threshold of 0.5 performance is:
## Accuracy Kappa
## 0.7821229 0.5414778
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8108108 0.7352941 0.7730525 0.7194245
##
## For mod4 threshold of 0.55 performance is:
## Accuracy Kappa
## 0.7877095 0.5468354
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8378378 0.7058824 0.7718601 0.7164179
##
## For mod4 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.8044693 0.5740703
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6764706 0.7796767 0.7244094
##
## For mod4 threshold of 0.65 performance is:
## Accuracy Kappa
## 0.8100559 0.5775958
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9189189 0.6323529 0.7756359 0.7166667
##
## For mod4 threshold of 0.7 performance is:
## Accuracy Kappa
## 0.7988827 0.5389898
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9549550 0.5441176 0.7495363 0.6727273
Perform a second pass to find the optimal threshold for each model.
mod1 performs best at 0.54.
mod2 performs best at 0.70 and 0.55.
mod3 performs best at 0.61.
mod4 performs best at 0.67.
mod3 has the highest accuracy at 83.24%. This will be the final glm model chosen.
#find more specific threshold for mod1
threshs = seq(0.50, 0.60, by=0.01)
threshs %>% walk(test_threshold1)
## For mod1 threshold of 0.5 performance is:
## Accuracy Kappa
## 0.7932961 0.5625206
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8288288 0.7352941 0.7820615 0.7299270
##
## For mod1 threshold of 0.51 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.52 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.53 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.54 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.55 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.56 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.57 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod1 threshold of 0.58 performance is:
## Accuracy Kappa
## 0.8044693 0.5740703
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6764706 0.7796767 0.7244094
##
## For mod1 threshold of 0.59 performance is:
## Accuracy Kappa
## 0.8044693 0.5740703
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6764706 0.7796767 0.7244094
##
## For mod1 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.8044693 0.5740703
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6764706 0.7796767 0.7244094
#find more specific threshold for mod2
threshs = seq(0.55, 0.70, by=0.01)
threshs %>% walk(test_threshold2)
## For mod2 threshold of 0.55 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod2 threshold of 0.56 performance is:
## Accuracy Kappa
## 0.8100559 0.5874458
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6911765 0.7870297 0.7343750
##
## For mod2 threshold of 0.57 performance is:
## Accuracy Kappa
## 0.7988827 0.5606164
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6617647 0.7723238 0.7142857
##
## For mod2 threshold of 0.58 performance is:
## Accuracy Kappa
## 0.7932961 0.5470834
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6470588 0.7649709 0.7040000
##
## For mod2 threshold of 0.59 performance is:
## Accuracy Kappa
## 0.7877095 0.5334705
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6323529 0.7576179 0.6935484
##
## For mod2 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.7932961 0.5444039
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8918919 0.6323529 0.7621224 0.6991870
##
## For mod2 threshold of 0.61 performance is:
## Accuracy Kappa
## 0.8044693 0.5664660
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9099099 0.6323529 0.7711314 0.7107438
##
## For mod2 threshold of 0.62 performance is:
## Accuracy Kappa
## 0.8044693 0.5664660
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9099099 0.6323529 0.7711314 0.7107438
##
## For mod2 threshold of 0.63 performance is:
## Accuracy Kappa
## 0.7932961 0.5361720
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9189189 0.5882353 0.7535771 0.6837607
##
## For mod2 threshold of 0.64 performance is:
## Accuracy Kappa
## 0.7932961 0.5333615
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.5735294 0.7507287 0.6782609
##
## For mod2 threshold of 0.65 performance is:
## Accuracy Kappa
## 0.7932961 0.5333615
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.5735294 0.7507287 0.6782609
##
## For mod2 threshold of 0.66 performance is:
## Accuracy Kappa
## 0.7932961 0.5333615
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.5735294 0.7507287 0.6782609
##
## For mod2 threshold of 0.67 performance is:
## Accuracy Kappa
## 0.7877095 0.5192933
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.5588235 0.7433757 0.6666667
##
## For mod2 threshold of 0.68 performance is:
## Accuracy Kappa
## 0.7877095 0.5192933
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.5588235 0.7433757 0.6666667
##
## For mod2 threshold of 0.69 performance is:
## Accuracy Kappa
## 0.7988827 0.5418089
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9459459 0.5588235 0.7523847 0.6785714
##
## For mod2 threshold of 0.7 performance is:
## Accuracy Kappa
## 0.8100559 0.5646015
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9639640 0.5588235 0.7613937 0.6909091
#find more specific threshold for mod3
threshs = seq(0.60, 0.65, by=0.01)
threshs %>% walk(test_threshold3)
## For mod3 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.8268156 0.6182844
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9189189 0.6764706 0.7976948 0.7479675
##
## For mod3 threshold of 0.61 performance is:
## Accuracy Kappa
## 0.8324022 0.6295019
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.6764706 0.8021993 0.7540984
##
## For mod3 threshold of 0.62 performance is:
## Accuracy Kappa
## 0.8212291 0.6024431
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9279279 0.6470588 0.7874934 0.7333333
##
## For mod3 threshold of 0.63 performance is:
## Accuracy Kappa
## 0.8268156 0.6137139
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9369369 0.6470588 0.7919979 0.7394958
##
## For mod3 threshold of 0.64 performance is:
## Accuracy Kappa
## 0.8324022 0.6250524
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9459459 0.6470588 0.7965024 0.7457627
##
## For mod3 threshold of 0.65 performance is:
## Accuracy Kappa
## 0.8268156 0.6113874
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9459459 0.6323529 0.7891494 0.7350427
#find more specific threshold for mod4
threshs = seq(0.60, 0.70, by=0.01)
threshs %>% walk(test_threshold4)
## For mod4 threshold of 0.6 performance is:
## Accuracy Kappa
## 0.8044693 0.5740703
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6764706 0.7796767 0.7244094
##
## For mod4 threshold of 0.61 performance is:
## Accuracy Kappa
## 0.8044693 0.5740703
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8828829 0.6764706 0.7796767 0.7244094
##
## For mod4 threshold of 0.62 performance is:
## Accuracy Kappa
## 0.8100559 0.5850266
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8918919 0.6764706 0.7841812 0.7301587
##
## For mod4 threshold of 0.63 performance is:
## Accuracy Kappa
## 0.8044693 0.5715653
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.8918919 0.6617647 0.7768283 0.7200000
##
## For mod4 threshold of 0.64 performance is:
## Accuracy Kappa
## 0.8044693 0.5690307
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9009009 0.6470588 0.7739799 0.7154472
##
## For mod4 threshold of 0.65 performance is:
## Accuracy Kappa
## 0.8100559 0.5775958
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9189189 0.6323529 0.7756359 0.7166667
##
## For mod4 threshold of 0.66 performance is:
## Accuracy Kappa
## 0.8212291 0.5976398
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9459459 0.6176471 0.7817965 0.7241379
##
## For mod4 threshold of 0.67 performance is:
## Accuracy Kappa
## 0.8268156 0.6090326
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9549550 0.6176471 0.7863010 0.7304348
##
## For mod4 threshold of 0.68 performance is:
## Accuracy Kappa
## 0.8212291 0.5951943
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9549550 0.6029412 0.7789481 0.7192982
##
## For mod4 threshold of 0.69 performance is:
## Accuracy Kappa
## 0.8044693 0.5531702
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9549550 0.5588235 0.7568892 0.6846847
##
## For mod4 threshold of 0.7 performance is:
## Accuracy Kappa
## 0.7988827 0.5389898
##
## Specificity Sensitivity Balanced Accuracy F1
## 0.9549550 0.5441176 0.7495363 0.6727273
Create a Random Forest model to see if results can be improved.
set.seed(123)
mod5 <- randomForest(as.factor(Survived) ~ Pclass+Sex+Age+SibSp+Fare+Embarked+Title+Group+Family+CabinLtr,
ntree=500, mtry=7, data=df_train_norm, importance=TRUE)
mod5
##
## Call:
## randomForest(formula = as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Fare + Embarked + Title + Group + Family + CabinLtr, data = df_train_norm, ntree = 500, mtry = 7, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 17.6%
## Confusion matrix:
## FALSE TRUE class.error
## FALSE 387 54 0.1224490
## TRUE 72 203 0.2618182
#visualize feature importance
varImpPlot(mod5)
# Use For loop to identify the right mtry for mod5
a=c()
i=5
for (i in 3:8) {
set.seed(123)
mod5 <- randomForest(as.factor(Survived) ~ Pclass+Sex+Age+Fare+Embarked+Title+Family+CabinLtr+Group,
data=df_train_norm, ntree = 500, mtry = i, importance = TRUE)
df_train_norm$score_pred5 <- predict(mod5, df_train_norm, type = "class")
a[i-2] = mean(df_train_norm$score_pred5 == df_train_norm$Survived)
}
a
## [1] 0.9371508 0.9525140 0.9734637 0.9874302 0.9888268 0.9888268
plot(3:8,a)
Add predictions into dataframe.
mod5 has an accuracy of 98.88% with training data. Using validation data it has an accuracy of 79.89%. The model is overfit but the accuracy still isn’t bad.
#score/predict results for random forest models
df_train_norm$score_pred5 <- predict(mod5, df_train_norm, type="class")
mean(df_train_norm$score_pred5 == df_train_norm$Survived)
## [1] 0.9888268
table(df_train_norm$score_pred5,df_train_norm$Survived)
##
## FALSE TRUE
## FALSE 439 6
## TRUE 2 269
df_val_norm$score_pred5 <- predict(mod5, newdata = df_val_norm)
mean(df_val_norm$score_pred5 == df_val_norm$Survived)
## [1] 0.7988827
table(df_val_norm$score_pred5, df_val_norm$Survived)
##
## FALSE TRUE
## FALSE 89 14
## TRUE 22 54
Run a KNN model to see if we can improve results. First, prepare the data to work with a KNN model better.
# set up new dataframe to turn factors into binary dummy variables for knn model
df_num_norm <- df_norm %>% mutate_if(is.ordered, as.character)
df_num_norm <- df_num_norm %>% mutate_if(is.character, as.factor)
df_num_norm <- df_num_norm %>% mutate(Child = as.factor(Child))
df_num_norm <- fastDummies::dummy_cols(df_num_norm)
df_num_norm <- df_num_norm %>% select(1:2,Age,Fare,15:81)
df_num_norm %>% glimpse()
## Observations: 1,316
## Variables: 71
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...
## $ Survived <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FA...
## $ Age <dbl> -0.55963667, 0.66141600, -0.25437350, 0.4...
## $ Fare <dbl> -0.50169283, 0.73844524, -0.48862006, 0.3...
## $ Pclass_3 <int> 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,...
## $ Pclass_1 <int> 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
## $ Pclass_2 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ Sex_male <int> 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ Sex_female <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,...
## $ SibSp_1 <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,...
## $ SibSp_0 <int> 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ SibSp_3 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ SibSp_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ SibSp_2 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ SibSp_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ SibSp_8 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Embarked_S <int> 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ Embarked_C <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ Embarked_Q <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Embarked_NA <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Mr <int> 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ Title_Mrs <int> 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,...
## $ Title_Miss <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,...
## $ Title_Master <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Title_Don <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Rev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Dr <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Mme <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Ms <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Major <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Lady <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Sir <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Mlle <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Col <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Capt <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ `Title_the Countess` <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Jonkheer <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Title_Dona <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Group_Officer <int> 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ Group_Mrs <int> 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,...
## $ Group_Miss <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,...
## $ Group_Master <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Group_Royalty <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Family_1 <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ Family_0 <int> 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,...
## $ Family_4 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Family_2 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,...
## $ Family_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ Family_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Family_3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Family_7 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Family_10 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ParCh_0 <int> 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,...
## $ ParCh_1 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,...
## $ ParCh_2 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ ParCh_5 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ ParCh_3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ParCh_4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ParCh_6 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ParCh_9 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Child_FALSE <int> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,...
## $ Child_TRUE <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,...
## $ CabinLtr_NA <int> 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,...
## $ CabinLtr_C <int> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ CabinLtr_E <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ CabinLtr_G <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ CabinLtr_D <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CabinLtr_A <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CabinLtr_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CabinLtr_F <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CabinLtr_T <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
#create test, train, and validation set
df_num_test_norm <- df_num_norm %>% filter(is.na(Survived))
set.seed(123)
df_num_train_norm <- df_num_norm %>% filter(!is.na(Survived)) %>% sample_frac(0.8)
df_num_val_norm <- df_num_norm %>% filter(!is.na(Survived)) %>% setdiff(df_num_train_norm)
Create the KNN model.
The features from mod1 appear to produce the highest accuracy in the KNN model.
#create function to find accuracy
accuracy <- function(x){
num <- x %>% filter(x$Survived == x$score) %>% nrow()
(num/nrow(x)) * 100
}
#use names function to get variable numbers easier
df_num_train_norm %>% names()
## [1] "PassengerId" "Survived" "Age"
## [4] "Fare" "Pclass_3" "Pclass_1"
## [7] "Pclass_2" "Sex_male" "Sex_female"
## [10] "SibSp_1" "SibSp_0" "SibSp_3"
## [13] "SibSp_4" "SibSp_2" "SibSp_5"
## [16] "SibSp_8" "Embarked_S" "Embarked_C"
## [19] "Embarked_Q" "Embarked_NA" "Title_Mr"
## [22] "Title_Mrs" "Title_Miss" "Title_Master"
## [25] "Title_Don" "Title_Rev" "Title_Dr"
## [28] "Title_Mme" "Title_Ms" "Title_Major"
## [31] "Title_Lady" "Title_Sir" "Title_Mlle"
## [34] "Title_Col" "Title_Capt" "Title_the Countess"
## [37] "Title_Jonkheer" "Title_Dona" "Group_Officer"
## [40] "Group_Mrs" "Group_Miss" "Group_Master"
## [43] "Group_Royalty" "Family_1" "Family_0"
## [46] "Family_4" "Family_2" "Family_6"
## [49] "Family_5" "Family_3" "Family_7"
## [52] "Family_10" "ParCh_0" "ParCh_1"
## [55] "ParCh_2" "ParCh_5" "ParCh_3"
## [58] "ParCh_4" "ParCh_6" "ParCh_9"
## [61] "Child_FALSE" "Child_TRUE" "CabinLtr_NA"
## [64] "CabinLtr_C" "CabinLtr_E" "CabinLtr_G"
## [67] "CabinLtr_D" "CabinLtr_A" "CabinLtr_B"
## [70] "CabinLtr_F" "CabinLtr_T"
#create function to find best value for k
ks = seq(1,29,by=2)
for(k in ks) {
mod6 <- knn(train=df_num_train_norm[c(5:7,17:20,39:43)], test=df_num_val_norm[c(5:7,17:20,39:43)], cl=df_num_train_norm$Survived, k=k)
df_num_train_norm$score <- mod6
cat(c("k=",k,":",accuracy(df_num_train_norm),"\n"))
}
## k= 1 : 54.608938547486
## k= 3 : 54.0502793296089
## k= 5 : 52.9329608938548
## k= 7 : 55.1675977653631
## k= 9 : 55.4469273743017
## k= 11 : 55.7262569832402
## k= 13 : 55.4469273743017
## k= 15 : 56.0055865921788
## k= 17 : 55.7262569832402
## k= 19 : 55.1675977653631
## k= 21 : 56.8435754189944
## k= 23 : 56.2849162011173
## k= 25 : 57.4022346368715
## k= 27 : 58.2402234636871
## k= 29 : 58.2402234636871
#create knn model based on highest value from above
mod6 <- knn(train=df_num_train_norm[c(5:7,17:20,39:43)], test=df_num_val_norm[c(5:7,17:20,39:43)], cl=df_num_train_norm$Survived, k=27)
Add predictions into dataframe.
mod6 has an accuracy of 58.24%. This is very low so the model will not be used.
#add predictions to data frame
df_num_train_norm$score <- mod6
#create confusion matrix
CrossTable(x=df_num_train_norm$Survived, y=df_num_train_norm$score)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 716
##
##
## | df_num_train_norm$score
## df_num_train_norm$Survived | FALSE | TRUE | Row Total |
## ---------------------------|-----------|-----------|-----------|
## FALSE | 359 | 82 | 441 |
## | 0.050 | 0.207 | |
## | 0.814 | 0.186 | 0.616 |
## | 0.623 | 0.586 | |
## | 0.501 | 0.115 | |
## ---------------------------|-----------|-----------|-----------|
## TRUE | 217 | 58 | 275 |
## | 0.081 | 0.333 | |
## | 0.789 | 0.211 | 0.384 |
## | 0.377 | 0.414 | |
## | 0.303 | 0.081 | |
## ---------------------------|-----------|-----------|-----------|
## Column Total | 576 | 140 | 716 |
## | 0.804 | 0.196 | |
## ---------------------------|-----------|-----------|-----------|
##
##
accuracy(df_num_train_norm)
## [1] 58.24022
Compare mod3 and mod5.
mod3 has a higher accuracy of 83.24% vs. mod5 at 79.89%.
mod3 will be the final model used to predict survival.
#add final thresholds for mod3
df_val_norm <- df_val_norm %>% mutate(score_pred3 = if_else(score3_prob > 0.61, TRUE, FALSE) %>% as.factor())
#confusion matrices
confusionMatrix(data=df_val_norm$score_pred3, reference = (df_val_norm$Survived %>% as.factor()), mode = "prec_recall", positive="TRUE")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 103 22
## TRUE 8 46
##
## Accuracy : 0.8324
## 95% CI : (0.7695, 0.884)
## No Information Rate : 0.6201
## P-Value [Acc > NIR] : 4.773e-10
##
## Kappa : 0.6295
##
## Mcnemar's Test P-Value : 0.01762
##
## Precision : 0.8519
## Recall : 0.6765
## F1 : 0.7541
## Prevalence : 0.3799
## Detection Rate : 0.2570
## Detection Prevalence : 0.3017
## Balanced Accuracy : 0.8022
##
## 'Positive' Class : TRUE
##
confusionMatrix(data=df_val_norm$score_pred5, reference = (df_val_norm$Survived %>% as.factor()), mode = "prec_recall", positive="TRUE")
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 89 14
## TRUE 22 54
##
## Accuracy : 0.7989
## 95% CI : (0.7326, 0.855)
## No Information Rate : 0.6201
## P-Value [Acc > NIR] : 2.01e-07
##
## Kappa : 0.5826
##
## Mcnemar's Test P-Value : 0.2433
##
## Precision : 0.7105
## Recall : 0.7941
## F1 : 0.7500
## Prevalence : 0.3799
## Detection Rate : 0.3017
## Detection Prevalence : 0.4246
## Balanced Accuracy : 0.7980
##
## 'Positive' Class : TRUE
##
View ROC curve and AUC as another check for model performance.
The ROC curve looks pretty good and the AUC is 87.62%.
#ROC curve and AUC
p <- predict(mod3, newdata=subset(df_val_norm,select=(-Survived)), type="response")
pr <- prediction(p, df_val_norm$Survived)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8761924
Finally, lets predict results with the original df_test data frame.
#apply model
df_test_norm$score <- predict(mod3, newdata = df_test_norm)
df_final <- df_test_norm %>% mutate(score_prob = exp(score)/(1 + exp(score)),
Survived = if_else(score_prob > 0.61, TRUE, FALSE) %>% as.numeric(),
PassengerId = as.double(PassengerId)) %>%
select(PassengerId,Survived)
df_final <- df_final %>% distinct()
df_final %>% return()
## # A tibble: 418 x 2
## PassengerId Survived
## <dbl> <dbl>
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 1
## 6 897 0
## 7 898 0
## 8 899 0
## 9 900 1
## 10 901 0
## # ... with 408 more rows
Export results.
write_csv(df_final, path=str_c(path, "titanic_predictions-v5.csv"))