Install ISLR
library(ISLR)
summary(Hitters)
There are some missing values here, so before we proceed we will remove them:
Hitters=na.omit(Hitters)
sum(is.na(Hitters$Salary))
Best subset regression
We will now use the package leaps to evaluate all the best-subset models:
library(leaps)
regfit.full=regsubsets(Salary~., data=Hitters)
summary(regfit.full)
By default it gives best-subsets up to size 8; let's increase that to 19, i.e. all the variables.
regfit.full=regsubsets(Salary~., data=Hitters, nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp")
which.min(reg.summary$cp)
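If we like, we can highlight that minimum on the Cp plot we just drew (a small sketch, not part of the original code):
# mark the model size with the smallest Cp on the existing plot
points(which.min(reg.summary$cp), min(reg.summary$cp), pch=4, col="red", cex=2)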
There is a plot method for the regsubsets object:
plot(regfit.full, scale="Cp")
coef(regfit.full, 10)
Forward Stepwise Selection
Forward stepwise is a greedy algorithm: each time it includes the next best variable. But it produces a nested sequence of models, so it is a much less adventurous search.
Here we use the regsubsets function but specify the method="forward" option:
regfit.fwd=regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")
summary(regfit.fwd)
plot(regfit.fwd, scale="Cp")
Model Selection Using a Validation Set
Let's make a training and validation set, so that we can choose a good subset model. We will do it using a slightly different approach from what was done in the book.
dim(Hitters)
263 20
set.seed(1)
train=sample(seq(263), 180, replace=FALSE)
regfit.fwd=regsubsets(Salary~., data=Hitters[train,], nvmax=19, method="forward")
Now we will make predictions on the observations not used for training.
We know there are 19 models, so we set up some vectors to record the errors. We have to do a bit of work here, because there is no predict method for regsubsets objects.
val.errors=rep(NA, 19)
x.test=model.matrix(Salary~., data=Hitters[-train,]) # notice the -index!
for(i in 1:19){
coefi=coef(regfit.fwd, id=i)
pred=x.test[, names(coefi)]%*%coefi
val.errors[i]=mean((Hitters$Salary[-train]-pred)^2)
}
plot(sqrt(val.errors), ylab="Root MSE", ylim=c(300, 400), pch=19, type="b")
points(sqrt(regfit.fwd$rss[-1]/180), col="blue", pch=19, type="b") # training RMSE: drop the null model and divide RSS by n=180
legend("topright", legend=c("Trainging","Validation"), col=c("blue", "black"), pch=19)
As we expect, the training error goes down monotonically as the model gets bigger, but not so for the validation error.
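To see which model size the validation set favours, we can look for the minimum of the validation errors (a quick sketch using the objects above):
# model size with the smallest validation error, and its coefficients
which.min(val.errors)
coef(regfit.fwd, id=which.min(val.errors))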
This was a little tedious, not having a predict method for regsubsets, so we will write one!
predict.regsubsets=function(object, newdata, id, ...){
form=as.formula(object$call[[2]]) # recover the formula used in the original fit
mat=model.matrix(form, newdata) # the test model matrix
coefi=coef(object, id=id)
mat[, names(coefi)]%*%coefi
}
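With this method in place, the validation loop above could be rewritten to call predict() directly; a minimal sketch that recomputes the same errors:
val.errors=rep(NA, 19)
for(i in 1:19){
pred=predict(regfit.fwd, Hitters[-train,], id=i) # dispatches to predict.regsubsets
val.errors[i]=mean((Hitters$Salary[-train]-pred)^2)
}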
Model Selection by Cross-Validation
We will do 10-fold cross-validation; it's really easy!
set.seed(1)
folds=sample(rep(1:10, length=nrow(Hitters)))
table(folds)
1 2 3 4 5 6 7 8 9 10
27 27 27 26 26 26 26 26 26 26
cv.errors=matrix(NA, 10, 19)
for(k in 1:10){
best.fit=regsubsets(Salary~., data=Hitters[folds!=k,], nvmax=19, method="forward")
for(i in 1:19){
pred=predict(best.fit, Hitters[folds==k,], id=i)
cv.errors[k, i]=mean((Hitters$Salary[folds==k]-pred)^2)
}
}
# apply over the columns (MARGIN=2): average the CV errors for each model size
rmse.cv=sqrt(apply(cv.errors,2,mean))
plot(rmse.cv, pch=19, type="b")
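Having seen the CV curve, we can pick the size with the smallest cross-validated RMSE and take its coefficients from the fit on the full data (a sketch; best.size is just an illustrative name):
best.size=which.min(rmse.cv) # model size favoured by 10-fold CV
best.size
coef(regfit.full, id=best.size)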
Ridge Regression and the Lasso
We will use the package glmnet, which does not use the model formula language, so we will set up an x and y.
library(glmnet)
x=model.matrix(Salary~.-1, data=Hitters)
y=Hitters$Salary
First we will fit a ridge-regression model. This is achieved by calling glmnet with alpha=0.
There is also a cv.glmnet function which will do the cross-validation for us.
fit.ridge=glmnet(x,y,alpha=0)
plot(fit.ridge, xvar="lambda",label=TRUE)
cv.ridge=cv.glmnet(x,y,alpha=0)
plot(cv.ridge)
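The cv.glmnet object stores the lambda that minimizes the CV error, so we can extract it along with the ridge coefficients at that value (a brief sketch):
cv.ridge$lambda.min # lambda with the smallest CV error
coef(cv.ridge, s="lambda.min") # ridge coefficients at that lambda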
Now we fit a lasso model; for this we use the default alpha=1
fit.lasso=glmnet(x,y)
plot(fit.lasso, xvar="lambda", label=TRUE)
#plot(fit.lasso, xvar="dev", label=TRUE)
cv.lasso=cv.glmnet(x,y)
plot(cv.lasso)
coef(cv.lasso)
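Note that coef() on a cv.glmnet object uses s="lambda.1se" by default, the largest lambda within one standard error of the minimum. If we want the coefficients at the minimizing lambda instead, we can ask for them explicitly (a brief sketch):
coef(cv.lasso, s="lambda.min")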
Suppose we want to use our earlier train/validation division to select the lambda for the lasso.
lasso.tr=glmnet(x[train,], y[train])
lasso.tr
pred=predict(lasso.tr, x[-train,])
dim(pred)
rmse=sqrt(apply((y[-train]-pred)^2, 2, mean))
plot(log(lasso.tr$lambda), rmse, type="b", xlab="Log(lambda)")
lam.best=lasso.tr$lambda[order(rmse)[1]]
lam.best
coef(lasso.tr, s=lam.best)