15 Imbalanced data
library(tidyverse)
Exercise 1 (Criteria for an example of imbalanced data)
set.seed(1235)
n <- 500
Y <- rbinom(n,1,0.05) %>% as.factor()
set.seed(12345)
P1 <- rbinom(n,1,0.005) %>% factor(levels=c("0","1"))
set.seed(123)
P2 <- rep(0,n)
P2[Y==1] <- rbinom(sum(Y==1),1,0.85)
P2[Y==0] <- rbinom(sum(Y==0),1,0.1)
P2 <- factor(P2,levels=c("0","1"))
table(P1,Y)
   Y
P1    0   1
  0 473  24
  1   3   0
table(P2,Y)
   Y
P2    0   1
  0 432   8
  1  44  16
T2 <- table(P2,Y)
acc <- sum(T2[c(1,4)])/sum(T2)
rec <- T2[2,2]/sum(T2[,2])
prec <- T2[2,2]/sum(T2[2,])
c(acc,rec,prec)
[1] 0.8960000 0.6666667 0.2666667
F1 <- 2*(rec*prec)/(rec+prec)
F1
[1] 0.3809524
rand <- sum(T2[1,])/n*sum(T2[,1])/n + sum(T2[2,])/n*sum(T2[,2])/n
kappa <- (acc-rand)/(1-rand)
kappa
[1] 0.3353783
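In other words, \(\kappa=(\text{acc}-p_e)/(1-p_e)\), where \(p_e=\sum_k \frac{n_{k\cdot}}{n}\,\frac{n_{\cdot k}}{n}\) is the accuracy expected under purely random agreement between prediction and truth; this is exactly the quantity rand computed above.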
Recover these indicators with the confusionMatrix function from caret, then compare the predictions P1 and P2.
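A minimal sketch with caret, assuming the Y, P1 and P2 defined above; the argument mode="everything" adds precision, recall and F1 to the default accuracy and kappa output:

library(caret)
# "1" is the rare class of interest, hence positive="1"
confusionMatrix(data=P1,reference=Y,positive="1",mode="everything")
confusionMatrix(data=P2,reference=Y,positive="1",mode="everything")

Alternatively, the yardstick package computes all five metrics for both predictions at once: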
library(yardstick)
df <- data.frame(Y,P1,P2)
multi_metric <- metric_set(accuracy,recall,precision,f_meas,kap)
df %>% pivot_longer(-Y,names_to = "algo",values_to = "prev") %>%
  group_by(algo) %>%
  multi_metric(truth=Y,estimate = prev,event_level = "second") %>%
  pivot_wider(names_from = algo,values_from = .estimate) %>%
  select(-2)
# A tibble: 5 × 3
  .metric        P1    P2
  <chr>       <dbl> <dbl>
1 accuracy   0.946  0.896
2 recall     0      0.667
3 precision  0      0.267
4 f_meas     0      0.381
5 kap       -0.0108 0.335
Exercise 2 (Retrospective sampling)
First note that \(\mathbf P(\tilde y_i=1)=\mathbf P(y_i=1|s_i=1)\). Moreover \[ \text{logit}\, p_\beta(x_i)=\log\frac{\mathbf P(y_i=1)}{\mathbf P(y_i=0)}\quad\text{and}\quad \text{logit}\, p_\gamma(x_i)=\log\frac{\mathbf P(y_i=1|s_i=1)}{\mathbf P(y_i=0|s_i=1)}. \] Now \[ \mathbf P(y_i=1|s_i=1)=\frac{\mathbf P(y_i=1,s_i=1)}{\mathbf P(s_i=1)}=\frac{\mathbf P(s_i=1|y_i=1)\mathbf P(y_i=1)}{\mathbf P(s_i=1)} \] and \[ \mathbf P(y_i=0|s_i=1)=\frac{\mathbf P(y_i=0,s_i=1)}{\mathbf P(s_i=1)}=\frac{\mathbf P(s_i=1|y_i=0)\mathbf P(y_i=0)}{\mathbf P(s_i=1)}. \] Hence \[ \text{logit}\, p_\gamma(x_i)=\log\frac{\mathbf P(y_i=1)}{\mathbf P(y_i=0)}+\log\frac{\mathbf P(s_i=1|y_i=1)}{\mathbf P(s_i=1|y_i=0)}=\text{logit}\,p_\beta(x_i)+\log\left(\frac{\tau_{1i}}{\tau_{0i}}\right). \]
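In practice this means a logistic regression fitted on the retrospective sample estimates the same slopes as the full-population model, but an intercept shifted by \(\log(\tau_1/\tau_0)\). A minimal simulation sketch, assuming constant sampling rates \(\tau_1=1\) (keep every 1) and \(\tau_0=0.1\) (keep 10% of the 0s), both chosen purely for illustration:

set.seed(42)
n <- 1e5
x <- rnorm(n)
y <- rbinom(n,1,plogis(-4+x))            # rare positives, true intercept -4
s <- ifelse(y==1,1,rbinom(n,1,0.1))      # retrospective selection indicator
d <- data.frame(x,y)[s==1,]
fit <- glm(y~x,data=d,family=binomial)
coef(fit)[1] - log(1/0.1)                # corrected intercept, close to -4

The slope estimate coef(fit)[2] needs no correction.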
Exercise 3 (Rebalancing)
df1 <- read.csv("../donnees/dd_exo3_1.csv") %>% mutate(Y=as.factor(Y))
df2 <- read.csv("../donnees/dd_exo3_2.csv") %>% mutate(Y=as.factor(Y))
df3 <- read.csv("../donnees/dd_exo3_3.csv") %>% mutate(Y=as.factor(Y))
summary(df1$Y)
  0   1
559 441
summary(df2$Y)
  0   1
692 308
summary(df3$Y)
  0   1
842 158
ggplot(df1)+aes(x=X1,y=X2,color=Y)+geom_point()
ggplot(df2)+aes(x=X1,y=X2,color=Y)+geom_point()
ggplot(df3)+aes(x=X1,y=X2,color=Y)+geom_point()
library(caret)
set.seed(123)
a1 <- createDataPartition(1:nrow(df1),p=2/3)
a2 <- createDataPartition(1:nrow(df2),p=2/3)
a3 <- createDataPartition(1:nrow(df3),p=2/3)
train1 <- df1[a1$Resample1,]
train2 <- df2[a2$Resample1,]
train3 <- df3[a3$Resample1,]
test1 <- df1[-a1$Resample1,]
test2 <- df2[-a2$Resample1,]
test3 <- df3[-a3$Resample1,]
logit1 <- glm(Y~.^2,data=train1,family=binomial)
logit2 <- glm(Y~.^2,data=train2,family=binomial)
logit3 <- glm(Y~.^2,data=train3,family=binomial)
# round() classifies with the default 0.5 probability threshold
p1 <- predict(logit1,newdata=test1,type="response") %>% round() %>% as.factor()
p2 <- predict(logit2,newdata=test2,type="response") %>% round() %>% as.factor()
p3 <- predict(logit3,newdata=test3,type="response") %>% round() %>% as.factor()
confusionMatrix(data=p1,reference=test1$Y)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 139  53
         1  45  95

               Accuracy : 0.7048
                 95% CI : (0.6526, 0.7534)
    No Information Rate : 0.5542
    P-Value [Acc > NIR] : 1.282e-08

                  Kappa : 0.3994

 Mcnemar's Test P-Value : 0.4795

            Sensitivity : 0.7554
            Specificity : 0.6419
         Pos Pred Value : 0.7240
         Neg Pred Value : 0.6786
             Prevalence : 0.5542
         Detection Rate : 0.4187
   Detection Prevalence : 0.5783
      Balanced Accuracy : 0.6987

       'Positive' Class : 0
confusionMatrix(data=p2,reference=test2$Y)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 206  52
         1  27  47

               Accuracy : 0.762
                 95% CI : (0.7125, 0.8068)
    No Information Rate : 0.7018
    P-Value [Acc > NIR] : 0.008642

                  Kappa : 0.387

 Mcnemar's Test P-Value : 0.006930

            Sensitivity : 0.8841
            Specificity : 0.4747
         Pos Pred Value : 0.7984
         Neg Pred Value : 0.6351
             Prevalence : 0.7018
         Detection Rate : 0.6205
   Detection Prevalence : 0.7771
      Balanced Accuracy : 0.6794

       'Positive' Class : 0
confusionMatrix(data=p3,reference=test3$Y)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 274  58
         1   0   0

               Accuracy : 0.8253
                 95% CI : (0.7801, 0.8646)
    No Information Rate : 0.8253
    P-Value [Acc > NIR] : 0.535

                  Kappa : 0

 Mcnemar's Test P-Value : 7.184e-14

            Sensitivity : 1.0000
            Specificity : 0.0000
         Pos Pred Value : 0.8253
         Neg Pred Value :    NaN
             Prevalence : 0.8253
         Detection Rate : 0.8253
   Detection Prevalence : 1.0000
      Balanced Accuracy : 0.5000

       'Positive' Class : 0
Note that the accuracy is highest for the third sample, in contrast with indicators suited to imbalance such as Cohen's \(\kappa\) or the balanced accuracy, which are at their worst there.
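Indeed, logit3 never predicts the class 1 on this test set, so its accuracy is just the proportion of 0s; the trivial rule that always predicts the majority class would reach the same 0.8253, which is precisely the No Information Rate reported above:

# accuracy of always predicting the majority class "0"
mean(test3$Y=="0")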
library(UBL)
set.seed(1243)
train3.over <- RandOverClassif(Y~.,dat=train3)
train3.smote <- SmoteClassif(Y~.,dat=train3)
train3.under <- RandUnderClassif(Y~.,dat=train3)
train3.tomek <- TomekClassif(Y~.,dat=train3)[[1]]
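A quick check of the class counts in each rebalanced training set; random over/under-sampling and SMOTE return approximately balanced samples, whereas Tomek links only remove a few points near the class boundary:

# class frequencies after each rebalancing scheme
sapply(list(over=train3.over,smote=train3.smote,
            under=train3.under,tomek=train3.tomek),
       function(d) summary(d$Y))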
ggplot(train3.under)+aes(x=X1,y=X2,color=Y)+geom_point()
ggplot(train3.over)+aes(x=X1,y=X2,color=Y)+geom_point()
ggplot(train3.smote)+aes(x=X1,y=X2,color=Y)+geom_point()
ggplot(train3.tomek)+aes(x=X1,y=X2,color=Y)+geom_point()
logit3.over <- glm(Y~.^2,data=train3.over,family=binomial)
logit3.smote <- glm(Y~.^2,data=train3.smote,family=binomial)
logit3.under <- glm(Y~.^2,data=train3.under,family=binomial)
logit3.tomek <- glm(Y~.^2,data=train3.tomek,family=binomial)
p3.over <- predict(logit3.over,newdata=test3,type="response") %>% round() %>% as.factor()
p3.smote <- predict(logit3.smote,newdata=test3,type="response") %>% round() %>% as.factor()
p3.under <- predict(logit3.under,newdata=test3,type="response") %>% round() %>% as.factor()
p3.tomek <- predict(logit3.tomek,newdata=test3,type="response") %>% round() %>% as.factor()
confusionMatrix(p3.over,test3$Y)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 182   9
         1  92  49

               Accuracy : 0.6958
                 95% CI : (0.6432, 0.7448)
    No Information Rate : 0.8253
    P-Value [Acc > NIR] : 1

                  Kappa : 0.3255

 Mcnemar's Test P-Value : 3.37e-16

            Sensitivity : 0.6642
            Specificity : 0.8448
         Pos Pred Value : 0.9529
         Neg Pred Value : 0.3475
             Prevalence : 0.8253
         Detection Rate : 0.5482
   Detection Prevalence : 0.5753
      Balanced Accuracy : 0.7545

       'Positive' Class : 0
confusionMatrix(p3.smote,test3$Y)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 181   9
         1  93  49

               Accuracy : 0.6928
                 95% CI : (0.6401, 0.742)
    No Information Rate : 0.8253
    P-Value [Acc > NIR] : 1

                  Kappa : 0.3217

 Mcnemar's Test P-Value : <2e-16

            Sensitivity : 0.6606
            Specificity : 0.8448
         Pos Pred Value : 0.9526
         Neg Pred Value : 0.3451
             Prevalence : 0.8253
         Detection Rate : 0.5452
   Detection Prevalence : 0.5723
      Balanced Accuracy : 0.7527

       'Positive' Class : 0
confusionMatrix(p3.under,test3$Y)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 187  11
         1  87  47

               Accuracy : 0.7048
                 95% CI : (0.6526, 0.7534)
    No Information Rate : 0.8253
    P-Value [Acc > NIR] : 1

                  Kappa : 0.325

 Mcnemar's Test P-Value : 3.56e-14

            Sensitivity : 0.6825
            Specificity : 0.8103
         Pos Pred Value : 0.9444
         Neg Pred Value : 0.3507
             Prevalence : 0.8253
         Detection Rate : 0.5633
   Detection Prevalence : 0.5964
      Balanced Accuracy : 0.7464

       'Positive' Class : 0
confusionMatrix(p3.tomek,test3$Y)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 274  55
         1   0   3

               Accuracy : 0.8343
                 95% CI : (0.7899, 0.8727)
    No Information Rate : 0.8253
    P-Value [Acc > NIR] : 0.3641

                  Kappa : 0.0826

 Mcnemar's Test P-Value : 3.305e-13

            Sensitivity : 1.00000
            Specificity : 0.05172
         Pos Pred Value : 0.83283
         Neg Pred Value : 1.00000
             Prevalence : 0.82530
         Detection Rate : 0.82530
   Detection Prevalence : 0.99096
      Balanced Accuracy : 0.52586

       'Positive' Class : 0
The indicators suited to imbalanced data improve: the 1s are detected much better, at the cost of more errors on the 0s.
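As a point of comparison, an alternative to resampling is to keep logit3, fitted on the original training set, and lower the classification threshold below the default 0.5; a sketch in which the 0.2 cutoff is purely illustrative and would normally be tuned (e.g. on an ROC curve):

# predict "1" as soon as the estimated probability exceeds 0.2
p3.prob <- predict(logit3,newdata=test3,type="response")
p3.low <- factor(as.numeric(p3.prob>0.2),levels=c("0","1"))
confusionMatrix(p3.low,test3$Y)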
Exercise 4 (Rebalancing and Fisher information)