9 Régression sur composantes : PCR et PLS

Régression MCO et choix de variables

ozone <- read.table("../donnees/ozone.txt",header=TRUE,sep=";",row.names=1)
modeleinit <- lm(O3 ~ ., data = ozone[,1:10])
round(coefficients(modeleinit),2)
(Intercept)         T12         T15        Ne12         N12         S12 
      54.73       -0.35        1.50       -4.19        1.28        3.17 
        E12         W12          Vx         O3v 
       0.53        2.47        0.61        0.25 
BIC(modeleinit)
[1] 431.8923
library(leaps)
choix <- regsubsets(O3 ~ .,nbest=1,nvmax=10,data=ozone[,1:10])
resume <- summary(choix)
indmin <- which.min(resume$bic)
nomselec <- colnames(resume$which)[resume$which[indmin,]][-1]
formule <- formula(paste("O3~",paste(nomselec,collapse="+")))
modeleBIC <- lm(formule,data=ozone[,1:10])
round(coefficients(modeleBIC),2)
(Intercept)         T15        Ne12          Vx         O3v 
      61.83        1.06       -3.99        0.31        0.26 
BIC(modeleBIC)
[1] 415.8866

Mise en place des données centrées réduites

X <- ozone[,2:10]
Xbar  <- apply(X, 2, mean)
stdX <- sqrt(apply(X, 2, var))
Xcr <- scale(X, center = Xbar, scale = stdX)

PCR

library(pls)
set.seed(87)
cvseg <- cvsegments(nrow(ozone), k = 4, type = "random")
n.app <- nrow(ozone)
modele.pcr <- pcr(O3 ~ ., ncomp=9, data=ozone[,1:10], scale=T,
                   validation = "CV", segments = cvseg)
msepcv.pcr <- MSEP(modele.pcr ,estimate=c("train","CV")) 
msepcv.pcr
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
train        559.8    188.7    186.5    185.2    179.8    158.8    152.0
CV           582.9    260.4    260.6    278.2    271.3    239.3    248.1
       7 comps  8 comps  9 comps
train    143.9    142.5    139.7
CV       242.1    244.0    239.4
npcr <- which.min(msepcv.pcr$val["CV",,])-1
modele.pcr.fin <- pcr(O3 ~ ., ncomp = npcr, scale = TRUE,data = ozone[,1:10])
X <- ozone[,2:10]
Y <- ozone[,1]
n <- nrow(X)
Xbar  <- apply(X,2,mean)
stdX <- sqrt(apply(X,2,var)*(n-1)/n)
Ybar <- mean(Y)
modele.pcr.fin <- pcr(O3~.,ncomp=1,scale=TRUE,data =ozone[,1:10])
betafinpcr <- matrix(coefficients(modele.pcr.fin),ncol=1)/stdX
mu <- mean(Y)-Xbar%*%betafinpcr
c(mu,betafinpcr)
 [1] 48.3373569  0.7884931  0.8236705 -1.7618991 -0.6760911  0.2126764
 [7]  1.6099503 -1.4694047  0.3118176  0.1662505

PLS

set.seed(87)
cvseg <- cvsegments(nrow(ozone), k = 4, type = "random")
n.app <- nrow(ozone)
modele.pls <- plsr(O3 ~ ., ncomp=9, data = ozone[,1:10], scale=T,validation = "CV", segments = cvseg)
msepcv.pls <- MSEP(modele.pls ,estimate=c("train","CV")) 
msepcv.pls
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
train        559.8    173.9    150.9    146.9    144.2    142.1    141.4
CV           582.9    251.9    248.4    245.3    234.6    241.7    243.6
       7 comps  8 comps  9 comps
train    140.9    139.8    139.7
CV       234.7    239.6    239.4
npls <- which.min(msepcv.pls$val["CV",,])-1
modele.pls.fin <- plsr(O3~ . , ncomp  =npls, scale = TRUE,data = ozone[,1:10])
X <- ozone[,2:10]
Y <- ozone[,1]
n <- nrow(X)
Xbar  <- apply(X,2,mean)
stdX <- sqrt(apply(X,2,var)*(n-1)/n)
Ybar <- mean(Y)
modele.pls.fin <- plsr(O3~.,ncomp=1,scale=TRUE,data =ozone[,1:10])
betafinpls <- matrix(coefficients(modele.pls.fin),ncol=1)/stdX
mu <- mean(Y)-Xbar%*%betafinpls
c(mu,betafinpls)
 [1] 47.1060475  0.8151834  0.8471154 -2.1910455 -0.5175855  0.3563688
 [7]  1.3005636 -1.2552108  0.2805513  0.1920499