Xgboost Algorithm: A Kaggle Case Study
Su Gaosheng holds a master's degree in statistics from Southwestern University of Finance and Economics and now works at China Telecom, where he focuses on big-data analytics and modeling for existing enterprise customers. Research interest: machine learning. Favorite programming language: R, bar none.
E-mail: sugs01@outlook.com
Part 0. Background and modeling approach
1. Background
This case study uses the data from the Kaggle competition "Santander Customer Satisfaction". It is an imbalanced binary-classification problem whose goal is to maximize AUC (the area under the ROC curve). The competition page is https://www.kaggle.com/c/santander-customer-satisfaction ; the competition has since closed.
2. Modeling approach
This walkthrough uses R's mlr package (a general-purpose machine-learning framework) to drive the xgboost algorithm for classification:
1) Read the data.
2) Explore the data: set up parallel execution, impute missing values, check whether the classes are balanced, drop constant columns, and keep only the columns present in both the training and test sets.
3) Select features:
I. Handle the class imbalance (candidate remedies include oversampling, undersampling, and ensembling); this case study uses oversampling, via mlr's oversample function, and makes a first pass at a suitable oversampling ratio.
II. Use mlr's generateFilterValuesData function and keep the features that account for 95% of the total information gain.
4) Tune parameters: step through the oversampling ratio rate, eta, max_depth, min_child_weight, gamma, colsample_bytree, and so on, revisiting them over several rounds until the results are satisfactory.
5) Ensemble the predictions: draw parameter values at random from each parameter's suitable range, fit an xgboost model per draw, average the models' predictions, and write out the result. The program in this case study achieves an ROC AUC of 0.816584.
Part 1. Reading the data
options(java.parameters = "-Xmx8g") ## used later during feature selection; must be set before any packages are loaded
library(readr)
xgb_tr1 <- read_csv("C:/Users/Administrator/kaggle/scs/train.csv")
xgb_te1 <- read_csv("C:/Users/Administrator/kaggle/scs/test.csv")
Part 2. Data exploration
1. Set up parallel execution
library(dplyr)
library(mlr)
library(parallelMap)
parallelStartSocket(4)
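parallelStartSocket(4) launches four local worker processes that mlr will use during resampling. One housekeeping note: when all of the training below has finished, the workers can be released (left commented out here, like the optional plotting calls later on):
# parallelStop() ## run once all model training is done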
2. A first look at each column
summarizeColumns(xgb_tr1)
3. Handle missing values: impute each column's missing entries with the column mean
imp_tr1 <- impute(
    as.data.frame(xgb_tr1),
    classes = list(
        integer = imputeMean(),
        numeric = imputeMean()
    )
)
imp_te1 <- impute(
    as.data.frame(xgb_te1),
    classes = list(
        integer = imputeMean(),
        numeric = imputeMean()
    )
)
4. Check the class distribution of the training target: the classes are imbalanced
table(xgb_tr1$TARGET)
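To quantify the imbalance, the counts can also be expressed as proportions; a minimal sketch using base R (in this competition TARGET = 1 marks the dissatisfied customers and is the rare class):
prop.table(table(xgb_tr1$TARGET)) ## share of each class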
5. Drop constant columns from both data sets
xgb_tr2 <- removeConstantFeatures(imp_tr1$data)
xgb_te2 <- removeConstantFeatures(imp_te1$data)
6. Keep only the columns shared by the training and test sets
tr2_name <- data.frame(tr2_name = colnames(xgb_tr2), stringsAsFactors = FALSE)
te2_name <- data.frame(te2_name = colnames(xgb_te2), stringsAsFactors = FALSE)
tr2_name_inner <- tr2_name %>%
    inner_join(te2_name, by = c('tr2_name' = 'te2_name'))
TARGET <- data.frame(TARGET = xgb_tr2$TARGET)
## keep the shared columns, skipping the first one (the ID column)
xgb_tr2 <- xgb_tr2[, tr2_name_inner$tr2_name[2:nrow(tr2_name_inner)]]
xgb_te2 <- xgb_te2[, tr2_name_inner$tr2_name[2:nrow(tr2_name_inner)]]
xgb_tr2 <- cbind(xgb_tr2, TARGET)
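A quick sanity check (a sketch) that the two data sets now agree on every feature column:
all(colnames(xgb_te2) %in% colnames(xgb_tr2)) ## should be TRUE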
Part 3. Feature filtering: information gain
1. Build the base task
xgb_tr2$TARGET <- factor(xgb_tr2$TARGET)
xgb.task <- makeClassifTask(data = xgb_tr2, target = 'TARGET')
set.seed(0)
2. Grid search over the oversampling ratio
##### 1) Search grid
grid_search <- expand.grid(
    over_rate = seq(1, 30, 2)
)
##### 2) Container for the auc values
perf_overrate_1 <- numeric(length = dim(grid_search)[1])
##### 3) Training
for(i in 1:dim(grid_search)[1]){
    ## oversampled task
    xgb.task.over <- oversample(xgb.task, rate = grid_search[i, 'over_rate'])
    ## hyperparameters
    xgb.ps <- makeParamSet(
        makeDiscreteParam('eta', values = .1)
    )
    ## tuning control
    xgb.ctrl <- makeTuneMultiCritControlGrid()
    ## resampling description
    xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
    ## build the learner
    xgb.learner = makeLearner(
        'classif.xgboost',
        predict.type = 'prob'
    )
    ## tune
    res <- tuneParamsMultiCrit(
        learner = xgb.learner,
        task = xgb.task.over,
        resampling = xgb.rdesc,
        par.set = xgb.ps,
        measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
        control = xgb.ctrl,
        show.info = TRUE
    )
    ## auc value
    perf_overrate_1[i] <- as.data.frame(trafoOptPath(res$opt.path))$auc.test.mean
}
##### 4) Results: which model has the largest auc
cat("Model ", which.max(perf_overrate_1), " has the largest auc: ", max(perf_overrate_1), sep = "")
##### 5) Parameters of the best model:
print(grid_search[which.max(perf_overrate_1), ])
Conclusion: take an oversampling ratio of rate = 15.
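The curve behind this choice can be inspected directly; a minimal sketch plotting the searched ratios against the recorded auc values:
plot(grid_search$over_rate, perf_overrate_1, type = 'b', xlab = 'over_rate', ylab = 'mean test auc')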
3. Feature selection
##### 1) Compute the filter values
xgb.task.over <- oversample(xgb.task, rate = 15)
fv_time <- system.time(
fv <- generateFilterValuesData(
xgb.task.over,
method = c('information.gain')
)
)
##### 2) Optional plots
# plotFilterValues(fv)
# plotFilterValuesGGVIS(fv)
##### 3) Keep the features covering 95% of the total information gain
fv_data2 <- fv$data %>%
arrange(desc(information.gain)) %>%
mutate(info_gain_cul = cumsum(information.gain) / sum(information.gain))
fv_data2_filter <- fv_data2 %>% filter(info_gain_cul <= 0.9508198) ## cutoff at ~95% cumulative information gain
dim(fv_data2_filter)
fv_feature <- fv_data2_filter$name
xgb_tr3 <- xgb_tr2[, c(fv_feature, 'TARGET')]
xgb_te3 <- xgb_te2[, fv_feature]
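As a quick check on how aggressive the cut is (a sketch), compare the number of retained features with the full filter table:
nrow(fv_data2_filter) / nrow(fv_data2) ## fraction of features kept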
##### 4) Write out the data
write_csv(xgb_tr3, 'C:/Users/Documents/kaggle/scs/xgb_tr3.csv')
write_csv(xgb_te3, 'C:/Users/Documents/kaggle/scs/xgb_te3.csv')
Part 4. Tuning: oversampling & undersampling
1. Build the base task
library(mlr)
xgb.task <- makeClassifTask(data = xgb_tr3, target = 'TARGET')
2. Grid search over the oversampling ratio
##### 1) Search grid
grid_search <- expand.grid(
    over_rate = seq(1, 30, 2)
)
##### 2) Container for the auc values
perf_overrate_1 <- numeric(length = dim(grid_search)[1])
##### 3) Training
for(i in 1:dim(grid_search)[1]){
    ## oversampled task
    xgb.task.over <- oversample(xgb.task, rate = grid_search[i, 'over_rate'])
    ## hyperparameters
    xgb.ps <- makeParamSet(
        makeDiscreteParam('eta', values = .1)
    )
    ## tuning control
    xgb.ctrl <- makeTuneMultiCritControlGrid()
    ## resampling description
    xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
    ## build the learner
    xgb.learner = makeLearner(
        'classif.xgboost',
        predict.type = 'prob'
    )
    ## tune
    res <- tuneParamsMultiCrit(
        learner = xgb.learner,
        task = xgb.task.over,
        resampling = xgb.rdesc,
        par.set = xgb.ps,
        measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
        control = xgb.ctrl,
        show.info = TRUE
    )
    ## auc value
    perf_overrate_1[i] <- as.data.frame(trafoOptPath(res$opt.path))$auc.test.mean
}
##### 4) Results: which model has the largest auc
cat("Model ", which.max(perf_overrate_1), " has the largest auc: ", max(perf_overrate_1), sep = "")
##### 5) Parameters of the best model:
print(grid_search[which.max(perf_overrate_1), ])
Conclusion: take rate = 19.
3. Grid search over the learning rate
##### 1) Learning task
xgb.task.over <- oversample(xgb.task, rate = 19)
##### 2) Hyperparameters
xgb.ps <- makeParamSet(
    makeDiscreteParam('eta', values = 2 ^ (-(8:1)))
)
##### 3) Tuning control
xgb.ctrl <- makeTuneMultiCritControlGrid()
##### 4) Resampling description
xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
##### 5) Build the learner
xgb.learner = makeLearner(
    'classif.xgboost',
    predict.type = 'prob'
)
##### 6) Tune
res <- tuneParamsMultiCrit(
    learner = xgb.learner,
    task = xgb.task.over,
    resampling = xgb.rdesc,
    par.set = xgb.ps,
    measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
    control = xgb.ctrl,
    show.info = TRUE
)
##### 7) auc values
perf_eta_1 <- as.data.frame(trafoOptPath(res$opt.path))
Conclusion: auc is essentially insensitive to eta.
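That conclusion is easiest to verify with a plot of the tuning path; a sketch (discrete parameters may come back as factors from trafoOptPath, hence the conversion):
plot(as.numeric(as.character(perf_eta_1$eta)), perf_eta_1$auc.test.mean, type = 'b', log = 'x', xlab = 'eta', ylab = 'mean test auc')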
4. Grid search over the maximum tree depth
##### 1) Learning task
xgb.task.over <- oversample(xgb.task, rate = 19)
##### 2) Hyperparameters
xgb.ps <- makeParamSet(
    makeDiscreteParam('eta', values = .1),
    makeDiscreteParam('max_depth', values = seq(4, 25, 1)),
    makeDiscreteParam('gamma', values = 10)
)
##### 3) Tuning control
xgb.ctrl <- makeTuneMultiCritControlGrid()
##### 4) Resampling description
xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
##### 5) Build the learner
xgb.learner = makeLearner(
    'classif.xgboost',
    predict.type = 'prob'
)
##### 6) Tune
res <- tuneParamsMultiCrit(
    learner = xgb.learner,
    task = xgb.task.over,
    resampling = xgb.rdesc,
    par.set = xgb.ps,
    measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
    control = xgb.ctrl,
    show.info = TRUE
)
##### 7) auc values
perf_maxdepth_1 <- as.data.frame(trafoOptPath(res$opt.path))
plot(perf_maxdepth_1$auc.test.mean)
Conclusion: auc still increases monotonically with max_depth, but the gains slow down, so take max_depth = 15 for now.
5. Grid search over gamma
##### 1) Learning task
xgb.task.over <- oversample(xgb.task, rate = 19)
##### 2) Hyperparameters
xgb.ps <- makeParamSet(
    makeDiscreteParam('eta', values = .1),
    makeDiscreteParam('max_depth', values = 15),
    makeDiscreteParam('min_child_weight', values = 2),
    makeDiscreteParam('gamma', values = 2^(-3:3))
)
perf_gamma_1 <- numeric(length = length(xgb.ps$pars$gamma$values))
##### 3) Tuning control
xgb.ctrl <- makeTuneMultiCritControlGrid()
##### 4) Resampling description
xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
##### 5) Build the learner
xgb.learner = makeLearner(
    'classif.xgboost',
    predict.type = 'prob'
)
##### 6) Tune
res <- tuneParamsMultiCrit(
    learner = xgb.learner,
    task = xgb.task.over,
    resampling = xgb.rdesc,
    par.set = xgb.ps,
    measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
    control = xgb.ctrl,
    show.info = TRUE
)
##### 7) auc values
perf_gamma_1 <- as.data.frame(trafoOptPath(res$opt.path))
Conclusion: auc decreases monotonically with gamma, though only slightly.
6. Second grid search over gamma
##### 1) Learning task
xgb.task.over <- oversample(xgb.task, rate = 19)
##### 2) Hyperparameters
xgb.ps <- makeParamSet(
    makeDiscreteParam('eta', values = .1),
    makeDiscreteParam('max_depth', values = 15),
    makeDiscreteParam('min_child_weight', values = 1),
    makeDiscreteParam('gamma', values = seq(10, 45, by = 2))
)
perf_gamma_2 <- numeric(length = length(xgb.ps$pars$gamma$values))
##### 3) Tuning control
xgb.ctrl <- makeTuneMultiCritControlGrid()
##### 4) Resampling description
xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
##### 5) Build the learner
xgb.learner = makeLearner(
    'classif.xgboost',
    predict.type = 'prob'
)
##### 6) Tune
res <- tuneParamsMultiCrit(
    learner = xgb.learner,
    task = xgb.task.over,
    resampling = xgb.rdesc,
    par.set = xgb.ps,
    measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
    control = xgb.ctrl,
    show.info = TRUE
)
##### 7) auc values
perf_gamma_2 <- as.data.frame(trafoOptPath(res$opt.path))
Conclusion: auc decreases monotonically with gamma; take gamma = 23 for now.
7. Grid search over min_child_weight
##### 1) Learning task
xgb.task.over <- oversample(xgb.task, rate = 19)
##### 2) Hyperparameters
xgb.ps <- makeParamSet(
    makeDiscreteParam('eta', values = .1),
    makeDiscreteParam('max_depth', values = 15),
    makeDiscreteParam('gamma', values = 23),
    makeDiscreteParam('min_child_weight', values = 2 ^ (0:5))
)
##### 3) Tuning control
xgb.ctrl <- makeTuneMultiCritControlGrid()
##### 4) Resampling description
xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
##### 5) Build the learner
xgb.learner = makeLearner(
    'classif.xgboost',
    predict.type = 'prob'
)
##### 6) Tune
res <- tuneParamsMultiCrit(
    learner = xgb.learner,
    task = xgb.task.over,
    resampling = xgb.rdesc,
    par.set = xgb.ps,
    measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
    control = xgb.ctrl,
    show.info = TRUE
)
##### 7) auc values
perf_minchildweight_1 <- as.data.frame(trafoOptPath(res$opt.path))
Conclusion: auc first rises and then falls as min_child_weight increases, peaking at 0.9191293 when min_child_weight = 2, so set min_child_weight = 2.
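The peak at min_child_weight = 2 can be checked the same way; a sketch:
plot(as.numeric(as.character(perf_minchildweight_1$min_child_weight)), perf_minchildweight_1$auc.test.mean, type = 'b', log = 'x', xlab = 'min_child_weight', ylab = 'mean test auc')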
8. Grid search over colsample_bytree
##### 1) Learning task
xgb.task.over <- oversample(xgb.task, rate = 19)
##### 2) Hyperparameters
xgb.ps <- makeParamSet(
    makeDiscreteParam('eta', values = .1),
    makeDiscreteParam('max_depth', values = 15),
    makeDiscreteParam('min_child_weight', values = 2),
    makeDiscreteParam('gamma', values = 23),
    makeDiscreteParam('colsample_bytree', values = seq(.1, 1, .1))
)
##### 3) Tuning control
xgb.ctrl <- makeTuneMultiCritControlGrid()
##### 4) Resampling description
xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
##### 5) Build the learner
xgb.learner = makeLearner(
    'classif.xgboost',
    predict.type = 'prob'
)
##### 6) Tune
res <- tuneParamsMultiCrit(
    learner = xgb.learner,
    task = xgb.task.over,
    resampling = xgb.rdesc,
    par.set = xgb.ps,
    measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
    control = xgb.ctrl,
    show.info = TRUE
)
##### 7) auc values
perf_colsamplebytree_1 <- as.data.frame(trafoOptPath(res$opt.path))
Conclusion: auc increases monotonically with colsample_bytree, so set colsample_bytree = 1.
9. Second grid search over the oversampling ratio
##### 1) Search grid
grid_search <- expand.grid(
    over_rate = seq(1, 30)
)
##### 2) Container for the auc values
perf_overrate <- numeric(length = dim(grid_search)[1])
##### 3) Training
for(i in 1:dim(grid_search)[1]){
    ## oversampled task
    xgb.task.over <- oversample(xgb.task, rate = grid_search[i, 'over_rate'])
    ## hyperparameters
    xgb.ps <- makeParamSet(
        makeDiscreteParam('eta', values = .1),
        makeDiscreteParam('max_depth', values = 15),
        makeDiscreteParam('min_child_weight', values = 2),
        makeDiscreteParam('gamma', values = 23),
        makeDiscreteParam('colsample_bytree', values = 1)
    )
    ## tuning control
    xgb.ctrl <- makeTuneMultiCritControlGrid()
    ## resampling description
    xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
    ## build the learner
    xgb.learner = makeLearner(
        'classif.xgboost',
        predict.type = 'prob'
    )
    ## tune
    res <- tuneParamsMultiCrit(
        learner = xgb.learner,
        task = xgb.task.over,
        resampling = xgb.rdesc,
        par.set = xgb.ps,
        measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
        control = xgb.ctrl,
        show.info = TRUE
    )
    ## auc value
    perf_overrate[i] <- as.data.frame(trafoOptPath(res$opt.path))$auc.test.mean
}
##### 4) Results: which model has the largest auc
cat("Model ", which.max(perf_overrate), " has the largest auc: ", max(perf_overrate), sep = "")
##### 5) Parameters of the best model:
print(grid_search[which.max(perf_overrate), ])
Conclusion: auc first rises and then falls as over_rate increases, peaking at 0.9232057 when over_rate = 25; the plot of perf_overrate (shown below), however, suggests that over_rate = 18 is the more sensible choice.
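The plot referred to above is simply:
plot(grid_search$over_rate, perf_overrate, type = 'b', xlab = 'over_rate', ylab = 'mean test auc')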
10. Grid search over max_depth
##### 1) Learning task
xgb.task.over <- oversample(xgb.task, rate = 18)
##### 2) Hyperparameters
xgb.ps <- makeParamSet(
    makeDiscreteParam('eta', values = .1),
    makeDiscreteParam('max_depth', values = seq(5, 29, 2)),
    makeDiscreteParam('min_child_weight', values = 2),
    makeDiscreteParam('gamma', values = 23),
    makeDiscreteParam('colsample_bytree', values = 1)
)
##### 3) Tuning control
xgb.ctrl <- makeTuneMultiCritControlGrid()
##### 4) Resampling description
xgb.rdesc <- makeResampleDesc('CV', stratify = TRUE)
##### 5) Build the learner
xgb.learner = makeLearner(
    'classif.xgboost',
    predict.type = 'prob'
)
##### 6) Tune
res <- tuneParamsMultiCrit(
    learner = xgb.learner,
    task = xgb.task.over,
    resampling = xgb.rdesc,
    par.set = xgb.ps,
    measures = list(kappa, tpr, tnr, auc), ### pick measures to suit your own criteria
    control = xgb.ctrl,
    show.info = TRUE
)
##### 7) auc values
perf_maxdepth_2 <- as.data.frame(trafoOptPath(res$opt.path))
Conclusion: auc increases monotonically with max_depth, but deeper trees are costlier to compute, so set max_depth = 17 (the elbow) and go no further.
11. Train multiple models with these parameters and ensemble the results
##### 0) Parameters
set.seed(1)
grid_search <- expand.grid(
    over_rate = sample(13:29, 5, replace = FALSE),
    max_depth = sample(10:25, 5, replace = FALSE),
    min_child_weight = sample(2:4, 2, replace = FALSE),
    gamma = sample(25:40, 10, replace = FALSE),
    colsample_bytree = sample(seq(.7, .95, .02), 10, replace = FALSE)
)
sample_ind <- sample(5000, 100, replace = FALSE) ## draw 100 of the 5 * 5 * 2 * 10 * 10 = 5000 combinations
xgb.pred <- list()
grid_search2 <- grid_search[sample_ind, ]
for (i in 1:nrow(grid_search2)){
    ##### 1) Build the learning task
    xgb.task.over <- oversample(xgb.task, rate = grid_search2[i, 'over_rate'])
    ##### 2) Set the model parameters
    xgb.ps <- list(
        eta = .1,
        max_depth = grid_search2[i, 'max_depth'],
        min_child_weight = grid_search2[i, 'min_child_weight'],
        gamma = grid_search2[i, 'gamma'],
        colsample_bytree = grid_search2[i, 'colsample_bytree']
    )
    ##### 3) Build the learner
    xgb.lrn.over = makeLearner(
        cl = 'classif.xgboost',
        predict.type = 'prob',
        fix.factors.prediction = FALSE,
        par.vals = xgb.ps
    )
    ##### 4) Train the model
    xgb.train.over <- train(
        learner = xgb.lrn.over,
        task = xgb.task.over
    )
    ##### 5) Predict
    xgb.pred[[i]] <- predict(xgb.train.over, newdata = xgb_te3)
}
##### Ensemble the predictions
xgb.pred1 <- list()
for (i in 1:nrow(grid_search2)){
    xgb.pred1[[i]] <- xgb.pred[[i]]$data$prob.1 ## predicted probability of class 1
}
xgb.pred2 <- matrix(unlist(xgb.pred1), ncol = 100)
xgb.pred3 <- data.frame(prob1 = apply(xgb.pred2, 1, mean)) ## average across the 100 models
##### Write out the result
write_csv(xgb.pred3, "C:/Users/Administrator/kaggle/scs/xgb.pred.over1.csv")
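Note that this writes only the averaged probabilities. The competition's submission format expects an ID column and a TARGET column; a minimal sketch for assembling it, assuming xgb_te1 still holds the raw test data with its ID column:
submission <- data.frame(ID = xgb_te1$ID, TARGET = xgb.pred3$prob1)
write_csv(submission, 'C:/Users/Administrator/kaggle/scs/submission.csv')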