<- read.table("../donnees/artere.txt",header=T)
artere plot(chd~age,data=artere,pch=16)
11 Régression logistique
Présentation du modèle
<- table(artere$agrp,artere$chd)
tab_freq <- tab_freq[,2]/apply(tab_freq,1,sum)
freq cbind(tab_freq,round(freq,3))
0 1
1 9 1 0.100
2 13 2 0.133
3 9 3 0.250
4 10 5 0.333
5 7 6 0.462
6 3 5 0.625
7 4 13 0.765
8 2 8 0.800
<- c(19,29,34,39,44,49,54,59)
x.age plot(x.age,c(freq),type="s",xlim=c(18,80),ylim=c(0,1),xlab="âge",ylab="freq")
lines(c(59,80),rep(freq[length(freq)],2))
<- seq(15,80,by=0.01)
x <- exp(-5.31+0.11*x)/(1+exp(-5.31+0.11*x))
y lines(x,y,lty=3)
glm(chd~age,data=artere,family=binomial)
Call: glm(formula = chd ~ age, family = binomial, data = artere)
Coefficients:
(Intercept) age
-5.3095 0.1109
Degrees of Freedom: 99 Total (i.e. Null); 98 Residual
Null Deviance: 136.7
Residual Deviance: 107.4 AIC: 111.4
set.seed(12345)
<- factor(sample(c("A","B","C"),100,replace=T))
X #levels(X) <- c("A","B","C")
<- rep(0,100)
Y =="A"] <- rbinom(sum(X=="A"),1,0.9)
Y[X=="B"] <- rbinom(sum(X=="B"),1,0.1)
Y[X=="C"] <- rbinom(sum(X=="C"),1,0.9)
Y[X<- data.frame(X,Y)
donnees <- glm(Y~.,data=donnees,family=binomial)
model coef(model)
(Intercept) XB XC
2.0794415 -3.6549779 0.3772942
<- glm(Y~C(X,sum),data=donnees,family=binomial)
model1 coef(model1)
(Intercept) C(X, sum)1 C(X, sum)2
0.9868803 1.0925612 -2.5624167
Intervalles de confiance et tests
library(bestglm)
data(SAheart)
<- SAheart[c(2,408,35),]
new.SAheart row.names(new.SAheart) <- NULL
<- SAheart[-c(2,408,35),]
SAheart <- glm(chd~.,data=SAheart,family=binomial)
model round(summary(model)$coefficients,4)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.0837 1.3141 -4.6294 0.0000
sbp 0.0065 0.0058 1.1268 0.2598
tobacco 0.0814 0.0269 3.0232 0.0025
ldl 0.1794 0.0600 2.9891 0.0028
adiposity 0.0184 0.0295 0.6224 0.5337
famhistPresent 0.9325 0.2291 4.0694 0.0000
typea 0.0392 0.0123 3.1845 0.0015
obesity -0.0637 0.0446 -1.4300 0.1527
alcohol 0.0002 0.0045 0.0346 0.9724
age 0.0439 0.0122 3.5923 0.0003
confint.default(model)
2.5 % 97.5 %
(Intercept) -8.659355064 -3.507983650
sbp -0.004797560 0.017773140
tobacco 0.028628110 0.134174033
ldl 0.061770934 0.297043348
adiposity -0.039461469 0.076187468
famhistPresent 0.483354369 1.381572764
typea 0.015090098 0.063396115
obesity -0.151054913 0.023612482
alcohol -0.008639577 0.008950096
age 0.019931097 0.067793230
<- 1000
n set.seed(123)
<- sample(c("A","B","C"),n,replace=TRUE)
X1 <- rnorm(n)
X2 <- runif(n)
X3 <- 1+0*(X1=="A")+1*(X1=="B")-3*(X1=="C")+2*X2
cl <- rbinom(n,1,exp(cl)/(1+exp(cl)))
Y <- data.frame(X1,X2,X3,Y) donnees
<- glm(Y~.,data=donnees,family=binomial)
m1 library(car)
Anova(m1,type=3,test.statistic="Wald")
Analysis of Deviance Table (Type III tests)
Response: Y
Df Chisq Pr(>Chisq)
(Intercept) 1 28.4698 9.517e-08 ***
X1 2 212.5061 < 2.2e-16 ***
X2 1 210.3902 < 2.2e-16 ***
X3 1 0.3096 0.5779
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(m1,type=3,test.statistic="LR")
Analysis of Deviance Table (Type III tests)
Response: Y
LR Chisq Df Pr(>Chisq)
X1 376.74 2 <2e-16 ***
X2 417.66 1 <2e-16 ***
X3 0.31 1 0.5778
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
<- glm(Y~X2+X3,data=donnees,family=binomial)
m01 <- glm(Y~X1+X3,data=donnees,family=binomial)
m02 <- glm(Y~X1+X2,data=donnees,family=binomial)
m03 anova(m01,m1,test="LRT")
Analysis of Deviance Table
Model 1: Y ~ X2 + X3
Model 2: Y ~ X1 + X2 + X3
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 997 1109.67
2 995 732.93 2 376.74 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m02,m1,test="LRT")
Analysis of Deviance Table
Model 1: Y ~ X1 + X3
Model 2: Y ~ X1 + X2 + X3
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 996 1150.59
2 995 732.93 1 417.66 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m03,m1,test="LRT")
Analysis of Deviance Table
Model 1: Y ~ X1 + X2
Model 2: Y ~ X1 + X2 + X3
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 996 733.24
2 995 732.93 1 0.30976 0.5778
library(aod)
wald.test(Sigma=vcov(m1),b=coef(m1),Terms=c(2,3))
Wald test:
----------
Chi-squared test:
X2 = 212.5, df = 2, P(> X2) = 0.0
Prévisions
<- glm(chd~.,data=SAheart,family=binomial) model
<- SAheart[c(2,408,35),-10]
new.SAheart row.names(new.SAheart) <- NULL
new.SAheart
sbp tobacco ldl adiposity famhist typea obesity alcohol age
1 118 0.08 3.48 32.28 Present 52 29.14 3.81 46
2 178 20.00 9.78 33.55 Absent 37 27.29 2.88 62
3 140 3.90 7.32 25.05 Absent 47 27.36 36.77 32
predict(model, newdata=new.SAheart)
1 2 3
-0.9599837 1.5028033 -1.5743496
predict(model, newdata=new.SAheart,type="response")
1 2 3
0.2768815 0.8179922 0.1715972
<- predict(model,newdata=new.SAheart,type="link",se.fit = TRUE)
prev <- prev$fit-qnorm(0.975)*prev$se.fit
cl_inf <- prev$fit+qnorm(0.975)*prev$se.fit
cl_sup <- exp(cl_inf)/(1+exp(cl_inf))
binf <- exp(cl_sup)/(1+exp(cl_sup))
bsup data.frame(binf,bsup)
binf bsup
1 0.1774323 0.4046504
2 0.6040315 0.9297800
3 0.1024782 0.2731474
unique(artere[,"age"])
[1] 20 23 24 25 26 28 29 30 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
[26] 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 69
<- aggregate(artere[,"chd"],by=list(artere$age),FUN=mean)
sature names(sature) <- c("age","p")
<- aggregate(artere[,"chd"],by=list(artere$age),FUN=length)
ndesign names(ndesign) <- c("age","n")
merge(sature,ndesign,by="age")[1:5,]
age p n
1 20 0.0 1
2 23 0.0 1
3 24 0.0 1
4 25 0.5 2
5 26 0.0 2
plot(chd~age,data=artere,pch=15+chd,col=chd+1)
lines(p~age,data=sature)
<- glm(chd~.,data=SAheart,family=binomial)
model library(generalhoslem)
logitgof(obs= SAheart$chd, exp = fitted(model))
Hosmer and Lemeshow test (binary model)
data: SAheart$chd, fitted(model)
X-squared = 6.6586, df = 8, p-value = 0.5739
<- glm(chd~.,data=SAheart,family=binomial)
model <- predict(model)
prev_lin <- residuals(model,type="pearson") #Pearson
res_P <- rstandard(model,type="pearson") #Pearson standard
res_PS <- residuals(model,type="deviance") #Deviance
res_D <- rstandard(model,type="deviance") #Deviance standard res_DS
par(mfrow=c(2,2),pch=20,mai = c(0.1,0.15,0.1,0.1),mar=c(3,3,1,1),cex.axis=0.6,cex.lab=0.7,mgp=c(1.5,0.3,0),oma=c(1,0,0,0),tcl=-0.4)
plot(res_PS,cex=0.3,xlab="index",ylab="Pearson Standard")
plot(prev_lin,cex=0.3,res_PS,xlab="Prevision lineaire",ylab="Pearson Standard")
plot(res_DS,cex=0.3,xlab="index",ylab="Deviance Standard")
plot(prev_lin,cex=0.3,res_DS,xlab="Prevision lineaire",ylab="Deviance Standard")
Choix de variables
<- glm(chd~sbp+ldl,data=SAheart,family=binomial)
model0 <- glm(chd~sbp+ldl+famhist+alcohol,data=SAheart,family=binomial)
model1 anova(model0,model1,test="LRT")
Analysis of Deviance Table
Model 1: chd ~ sbp + ldl
Model 2: chd ~ sbp + ldl + famhist + alcohol
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 456 548.18
2 454 522.64 2 25.545 2.838e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data(SAheart)
<- bestglm(SAheart,family=binomial,IC="BIC")
mod_sel $BestModels mod_sel
sbp tobacco ldl adiposity famhist typea obesity alcohol age Criterion
1 FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE 506.3634
2 FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE 509.2566
3 FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE 509.9861
4 FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE 510.5745
5 FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE 510.7933
<- bestglm(SAheart,family=binomial,IC="AIC")
mod_sel1 $BestModels mod_sel1
sbp tobacco ldl adiposity famhist typea obesity alcohol age Criterion
1 FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE 485.6856
2 FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE 485.9799
3 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE 486.5490
4 TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE 486.6548
5 FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE 487.4435
Scoring
set.seed(1234)
<- sample(nrow(SAheart),300)
ind.app <- SAheart[ind.app,]
dapp <- SAheart[-ind.app,]
dval #Construction des modeles
<- glm(chd~tobacco+famhist,data=dapp,family=binomial)
model1 <- glm(chd~tobacco+famhist+adiposity+alcohol,
model2 data=dapp,family=binomial)
round(coef(model1),3)
(Intercept) tobacco famhistPresent
-1.784 0.140 1.095
round(coef(model2),3)
(Intercept) tobacco famhistPresent adiposity alcohol
-3.180 0.117 1.022 0.059 -0.002
<- round(predict(model1,newdata=dval,type="response"))
prev1 <- round(predict(model2,newdata=dval,type="response"))
prev2 mean(prev1!=dval$chd)
[1] 0.3395062
mean(prev2!=dval$chd)
[1] 0.3395062
set.seed(1245)
<- sample(1:10,nrow(SAheart),replace=TRUE)
bloc table(bloc)
bloc
1 2 3 4 5 6 7 8 9 10
52 39 44 62 49 36 47 38 53 42
<- data.frame(matrix(0,nrow=nrow(SAheart),ncol=2))
prev names(prev) <- c("model1","model2")
for (k in 1:10){
<- bloc==k
ind.val <- SAheart[!ind.val,]
dapp.k <- SAheart[ind.val,]
dval.k <- glm(chd~tobacco+famhist,data=dapp.k,family=binomial)
model1 <- glm(chd~tobacco+famhist+adiposity+alcohol,data=dapp.k,family=binomial)
model2 1] <- round(predict(model1,newdata=dval.k,type="response"))
prev[ind.val,2] <- round(predict(model2,newdata=dval.k,type="response"))
prev[ind.val,
}apply(sweep(prev,1,SAheart$chd,FUN="!="),2,mean)
model1 model2
0.3203463 0.3073593
<- predict(model1,newdata=dval)
score1 <- predict(model2,newdata=dval) score2
library(pROC)
<- roc(dval$chd,score1)
R1 <- roc(dval$chd,score2)
R2 plot(R1,lwd=3,legacy.axes=TRUE)
plot(R2,lwd=3,col="red",lty=2,legacy.axes=TRUE,add=TRUE)
<- c("black","red")
couleur legend("bottomright",legend=c("score1","score2"),col=couleur,lty=1:2,lwd=2,cex=0.75)
auc(R1)
Area under the curve: 0.7356
auc(R2)
Area under the curve: 0.7372
<- data.frame(matrix(0,nrow=nrow(SAheart),ncol=2))
score names(score) <- c("score1","score2")
for (k in 1:10){
<- bloc==k
ind.val <- SAheart[!ind.val,]
dapp.k <- SAheart[ind.val,]
dval.k <- glm(chd~tobacco+famhist,data=dapp.k,family=binomial)
model1 <- glm(chd~tobacco+famhist+adiposity+alcohol,data=dapp.k,family=binomial)
model2 1] <- predict(model1,newdata=dval.k)
score[ind.val,2] <- predict(model2,newdata=dval.k)
score[ind.val, }
$obs <- SAheart$chd
score<- roc(obs~score1+score2,data=score)
roc.cv <- c("black","red")
couleur mapply(plot,roc.cv,col=couleur,lty=1:2,add=c(F,T),lwd=3,legacy.axes=TRUE)
score1 score2
percent FALSE FALSE
sensitivities numeric,356 numeric,463
specificities numeric,356 numeric,463
thresholds numeric,356 numeric,463
direction "<" "<"
cases numeric,160 numeric,160
controls numeric,302 numeric,302
fun.sesp ? ?
auc 0.7159872 0.7271937
call expression expression
original.predictor numeric,462 numeric,462
original.response integer,462 integer,462
predictor numeric,462 numeric,462
response integer,462 integer,462
levels character,2 character,2
predictor.name "score1" "score2"
response.name "obs" "obs"
legend("bottomright",legend=c("score1","score2"),col=couleur,lty=1:2,lwd=2,cex=0.75)
sort(round(unlist(lapply(roc.cv,auc)),3),decreasing=TRUE)
score2 score1
0.727 0.716