Cross validation is a method used to assess the out-of-sample (generalizable) performance of a model. Often we do a random train/test split to assess model performance. However, this is not a robust measure, as the superior (or inferior) performance of the model may be a consequence of the particular choice of test set. What if we fit the model to different training sets and then test it on the data held out each time? This is the basic idea behind cross validation. We divide the data into several (say \(k\)) folds. We test the model on one of the folds and build the model on the others. The overall performance across all \(k\) folds is taken as a measure of performance. Let’s first load the required libraries and build a model.
library(ISLR)
library(ggplot2)
library(boot)
data("Auto")
We will use various models to compare performances. Let’s split the data into training and test sets.
# Fix the RNG seed so the random train/test split (and everything fit on
# it below) is reproducible; without this, every knit gives different
# coefficients and MSEs.
set.seed(1)
# Draw 196 of Auto's 392 row indices as the training set.
train <- sample(392, 196)
# attach() exposes Auto's columns (e.g. mpg) as unqualified names; it is
# generally discouraged, but kept here because later lines reference mpg
# directly.
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
# Copy of the training rows. NOTE(review): Auto.tr is not used again in
# this document -- the models below select rows via `subset = train`.
Auto.tr <- Auto[train,]
# Degree-1 (simple linear) regression of mpg on horsepower, fit on the
# training rows only.
lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.4313 -3.5043 -0.5918 2.8185 17.0250
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.194903 1.037981 38.72 <2e-16 ***
## horsepower -0.163383 0.009667 -16.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.791 on 194 degrees of freedom
## Multiple R-squared: 0.5955, Adjusted R-squared: 0.5934
## F-statistic: 285.6 on 1 and 194 DF, p-value: < 2.2e-16
# Quadratic model: poly() builds orthogonal polynomial terms of
# horsepower up to degree 2.
lm.fit2 <- lm(mpg ~ poly(horsepower,2), data = Auto, subset = train)
summary(lm.fit2)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6219 -2.7437 -0.2672 2.6113 14.7910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.2652 0.3047 76.364 < 2e-16 ***
## poly(horsepower, 2)1 -119.1268 6.5507 -18.185 < 2e-16 ***
## poly(horsepower, 2)2 49.1896 6.6546 7.392 4.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.241 on 193 degrees of freedom
## Multiple R-squared: 0.6848, Adjusted R-squared: 0.6815
## F-statistic: 209.6 on 2 and 193 DF, p-value: < 2.2e-16
# Test-set MSE of the quadratic model. BUG FIX: the original called
# predict(lm.fit2, Auto[-train]), but a single-index subset of a data
# frame (Auto[-train]) drops COLUMNS, not rows. Predict on the full data
# and select the held-out rows afterwards, matching the pattern used for
# the cubic and quartic models below.
mean((mpg - predict(lm.fit2, Auto))[-train]^2)
## [1] 19.08639
# Cubic model of mpg on horsepower, again fit on the training rows only.
lm.fit3 <- lm(mpg ~ poly(horsepower,3), data = Auto, subset = train)
summary(lm.fit3)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = Auto, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6595 -2.5948 -0.1822 2.2928 14.3703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.291 0.303 76.867 < 2e-16 ***
## poly(horsepower, 3)1 -120.035 6.526 -18.393 < 2e-16 ***
## poly(horsepower, 3)2 49.320 6.612 7.459 2.93e-12 ***
## poly(horsepower, 3)3 -12.587 6.707 -1.877 0.0621 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.213 on 192 degrees of freedom
## Multiple R-squared: 0.6905, Adjusted R-squared: 0.6856
## F-statistic: 142.8 on 3 and 192 DF, p-value: < 2.2e-16
# Test-set MSE of the cubic model: predict on all rows, then keep only
# the held-out observations via [-train] before squaring and averaging.
# (mpg is visible unqualified because of attach(Auto) above.)
mean((mpg - predict(lm.fit3,Auto))[-train]^2)
## [1] 21.06729
# Quartic model of mpg on horsepower. BUG FIX: the original re-printed
# summary(lm.fit3) here (copy-paste slip, visible in the repeated output
# below); summarise the degree-4 fit instead.
lm.fit4 <- lm(mpg ~ poly(horsepower, 4), data = Auto, subset = train)
summary(lm.fit4)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = Auto, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6595 -2.5948 -0.1822 2.2928 14.3703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.291 0.303 76.867 < 2e-16 ***
## poly(horsepower, 3)1 -120.035 6.526 -18.393 < 2e-16 ***
## poly(horsepower, 3)2 49.320 6.612 7.459 2.93e-12 ***
## poly(horsepower, 3)3 -12.587 6.707 -1.877 0.0621 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.213 on 192 degrees of freedom
## Multiple R-squared: 0.6905, Adjusted R-squared: 0.6856
## F-statistic: 142.8 on 3 and 192 DF, p-value: < 2.2e-16
# Test-set MSE of the quartic model, computed the same way as for the
# cubic model above.
mean((mpg - predict(lm.fit4,Auto))[-train]^2)
## [1] 20.9698
Let’s assess the performance of these models in terms of their average MSE. Let us compare models with polynomial terms up to degree 4. We will do this automatically using a for loop.
# Estimate the average MSE for polynomial degrees 1-4 in two ways:
# leave-one-out CV and 10-fold CV, both via boot::cv.glm().
cv.error.LOOCV <- numeric(4)   # LOOCV error, one entry per degree
cv.error.10fold <- numeric(4)  # 10-fold CV error, one entry per degree
for (j in seq_len(4)) {
  # glm() with no family defaults to gaussian, i.e. an ordinary linear
  # model -- the form cv.glm() expects.
  model <- glm(mpg ~ poly(horsepower, j), data = Auto)
  # delta[1] is the raw cross-validation estimate of prediction error.
  cv.error.LOOCV[j] <- cv.glm(Auto, model)$delta[1]
  cv.error.10fold[j] <- cv.glm(Auto, model, K = 10)$delta[1]
}
Let’s plot these errors.
# Plot both CV error curves against polynomial degree. Mapping a string
# label to `color` inside aes() creates a legend entry; the actual line
# colours are then pinned with scale_color_manual().
degrees <- 1:4
ggplot() +
  geom_line(aes(x = degrees, y = cv.error.LOOCV, color = "LOOCV")) +
  geom_line(aes(x = degrees, y = cv.error.10fold, color = "10-fold")) +
  scale_color_manual(values = c("LOOCV" = "red", "10-fold" = "blue")) +
  labs(
    x = "Polynomial degree",
    y = "Average MSE",
    title = "Cross validation results",
    color = "Method"  # legend title
  )
We can do cross validation using the caret package also. The main
advantage of this is that caret allows you to get error metrics other
than the MSE: R-squared, RMSE, and MAE can be computed. Instead of
lm, a random forest can also be implemented.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
# 10-fold CV via caret::train(), collecting the MAE for polynomial
# degrees 1 through 10. Other metrics (RMSE, Rsquared) or other learners
# (e.g. random forest) could be substituted.
ctrl <- trainControl(method = "cv", number = 10)
cv.error.new <- rep(0, 10)
for (j in 1:10) {
  # BUG FIX: the formula must be rebuilt inside the loop. The original
  # built `form` once, before the loop, using the stale value of j left
  # over from the earlier degree-1:4 loop, so all 10 iterations fit the
  # same model instead of degrees 1..10.
  form <- as.formula(paste("mpg ~ poly(horsepower,", j, ")"))
  model_caret <- train(form,
                       data = Auto,
                       method = "lm",
                       trControl = ctrl)
  cv.error.new[j] <- model_caret$results$MAE
}
# Scatter plot of the caret MAE estimates for degrees 1 through 10.
ggplot() +
  geom_point(aes(x = seq_len(10), y = cv.error.new))
The bootstrap is another resampling method which creates multiple datasets from a given dataset by sampling observations with replacement. The predictions from each of the individual “bootstrapped samples” can be used to get a variance of the predicted quantity.
# Portfolio: 100 (X, Y) return pairs shipped with the ISLR package.
data("Portfolio")
#define a function for computing something of interest, in this case alpha
# Bootstrap statistic: alpha, the fraction of a two-asset portfolio
# allocated to X that minimises the portfolio variance (the standard
# ISLR example). Computed on the (X, Y) rows picked out by `index`.
alpha.fn <- function(data, index) {
  x <- data$X[index]
  y <- data$Y[index]
  cv <- cov(x, y)
  (var(y) - cv) / (var(x) + var(y) - 2 * cv)
}
Let’s bootstrap by sampling from the 100 observations another 100 with replacement. This would result in multiple copies of the same observation.
# NOTE(review): boot was already loaded at the top of the file; this
# library() call is redundant but harmless.
library(boot)
# Resample Portfolio's rows with replacement R = 1000 times and
# re-evaluate alpha.fn on each replicate; the printout reports the
# original estimate, bootstrap bias, and standard error. Without
# set.seed() the exact numbers vary run to run.
boot(Portfolio, alpha.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Portfolio, statistic = alpha.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.5758321 0.005534111 0.08942474
# Bootstrap statistic: the OLS coefficients (intercept, slope) of
# mpg ~ horsepower fit on the rows selected by `index`.
boot.fn <- function(data, index) {
  coef(lm(mpg ~ horsepower, data = data, subset = index))
}
# Draw 1000 bootstrap replicates of the regression coefficients over the
# Auto data; printing shows the original estimates, bias, and standard
# errors. NOTE(review): no set.seed(), so values change run to run.
boot_results<-boot(Auto, boot.fn, R=1000)
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 39.9358610 0.0412360738 0.867544360
## t2* -0.1578447 -0.0004118463 0.007483388
Just like in the last code snippet, 1000 bootstrapped samples are created and a point estimate is drawn from each. We can now compute percentile confidence intervals for the coefficients.
# 95% percentile bootstrap confidence interval for the intercept
# (index = 1 selects the first bootstrapped statistic, t1*).
boot.ci(boot_results, type = "perc",index=1)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_results, type = "perc", index = 1)
##
## Intervals :
## Level Percentile
## 95% (38.30, 41.63 )
## Calculations and Intervals on Original Scale
# 95% percentile bootstrap confidence interval for the horsepower slope
# (index = 2 selects the second bootstrapped statistic, t2*).
boot.ci(boot_results, type = "perc",index=2)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_results, type = "perc", index = 2)
##
## Intervals :
## Level Percentile
## 95% (-0.1734, -0.1446 )
## Calculations and Intervals on Original Scale