The rxfastforest algorithm case of kaggle
苏高生,西南财经大学统计学硕士毕业,现就职于中国电信,主要负责企业存量客户大数据分析、数据建模。研究方向:机器学习,最喜欢的编程语言:R语言,没有之一。
E-mail:sugs01@outlook.com
往期回顾:
零、案例背景介绍与建模思路说明
1.背景介绍
本案例使用的数据为kaggle中“Santander Customer Satisfaction”比赛的数据。此案例为不平衡二分类问题,目标为最大化auc值(ROC曲线下方面积)。竞赛题目链接为:https://www.kaggle.com/c/santander-customer-satisfaction 。目前此比赛已经结束。
2.建模思路
此文档采用R中的mlr包中的smote算法来处理数据类别不平衡的问题,用Microsoft R Server(专业版R)中的MicrosoftML包中的rxFastForest函数进行随机森林建模。采用mlr包调用randomForest包的randomForest函数建模,即使进行并行运算,效率依然低下,不能满足正常工作的需要;因此需要调用Microsoft R Server自带的函数:RevoScaleR包的rxDForest函数可以进行随机森林建模,但是效率远低于rxFastForest函数,因此本文档采用rxFastForest函数。由于随机森林函数效率较低,因此此文档所读取的数据为“ http://rpubs.com/yisu/xgboost_mlr_kaggle_case_oversample ” 文档中处理后的xgb_tr3,xgb_te3数据(提取约95%的信息增益);故而本文档直接进入建模部分,不再做数据探索与处理。
1) 读取数据;
2) 并行运算:由于rxFastForest函数可以通过设置相应参数进行并行运算,因此不再调用doParallel与foreach包进行并行运算;
3) 特征选择:本文档不再处理;
4) 调参:逐步调试rxFastForest函数的参数,并多次调试,直到满意为止;
5) 集成预测结果:在每个参数的适宜范围内随机抽取参数值构建rxFastForest模型,并将多个模型进行集成,输出预测结果;本案例所用程序输出结果的AUC值为0.829533,已超过Private Leaderboard排名第一的结果。
一、读取数据
# Load the pre-processed training and test sets from CSV into memory.
rx_tr <- rxImport(
  inData = 'C:/Users/Administrator/Documents/kaggle/scs_rf/rf_tr3.csv'
)
rx_te <- rxImport(
  inData = 'C:/Users/Administrator/Documents/kaggle/scs_rf/rf_te3.csv'
)
二、算法
1.建模准备
1)模型公式
# Build the model formula as a string: TARGET on the left, every column of
# rx_tr except the last one on the right, joined with ' + '.
# rx_tr_ncol was never defined in this script, so the column count is taken
# from rx_tr directly.
# NOTE(review): this assumes TARGET is the last column of rx_tr — confirm.
rx_predictors <- colnames(rx_tr)[seq_len(ncol(rx_tr) - 1)]
rx_formula <- paste0(
  'TARGET ~ ',
  paste0(rx_predictors, collapse = ' + ')
)
2)加载包
library(mlr) ## 调用smote函数
library(parallelMap) ## 并行运算
parallelStartSocket(4)
library(pROC) ## 计算auc值
library(caret) ## 十折交叉验证
library(ggplot2) ## 调参时绘图查看参数不同值对应的auc值,以确定最优参数
2.调试parms参数中的rate与nn参数(smote处理类别不平衡)
1)rate与nn参数
# Every combination of the two SMOTE arguments to try:
# rate from 5 to 50 in steps of 5, nn from 5 to 17 in steps of 2.
grid_search <- expand.grid(
  rate = seq(from = 5, to = 50, by = 5),
  nn = seq(from = 5, to = 17, by = 2)
)
2)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_rate_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
3)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(1)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
4)计算auc值—由于rxFastForest自动调用并行运算,因此此处使用循环
# Grid search over the SMOTE settings. For every row j of grid_search and
# every fold i: hold fold i out, SMOTE the remaining rows with that row's
# rate/nn, fit rxFastForest (500 trees), score the held-out rows, and store
# the fold AUC in perf_rate_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = grid_search[j, 'rate'],
      nn = grid_search[j, 'nn']
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = 500,
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_rate_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the 10 folds for each grid combination
perf_rate_1_f <- apply(perf_rate_1, 1, mean)
# The plot maps y to `perf`, but that column was never added to grid_search;
# attach the mean AUC so ggplot can find it.
grid_search$perf <- perf_rate_1_f
# Plot mean AUC against rate, one panel per nn value
ggplot(data = grid_search, aes(x = rate, y = perf)) +
  geom_point() +
  facet_wrap(facets = ~ nn, ncol = 3)
5)结论:rate = 5, nn = 9时最优
3.继续调试parms参数中的rate参数(类别不平衡)
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# rx_tr_nrow was never defined, so the row count is taken from rx_tr directly.
set.seed(2)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)rate参数
# Finer search over the SMOTE rate, with nn held at 9 (the best value found
# by the preceding rate/nn search).
# NOTE(review): the article repeated the fold-creation code under this heading,
# so the grid actually used is lost; the rate values below are a reconstruction
# around the previous optimum — confirm against the author's original script.
grid_search <- expand.grid(
  rate = seq(2, 10, 1),
  nn = 9
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
# The original line repeated the (fused, unparseable) fold creation and never
# allocated perf_rate_2, which the loop below writes into.
perf_rate_2 <- matrix(nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Second pass over the SMOTE settings. For every row j of grid_search and
# every fold i: hold fold i out, SMOTE the remaining rows with that row's
# rate/nn, fit rxFastForest (500 trees), score the held-out rows, and store
# the fold AUC in perf_rate_2[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = grid_search[j, 'rate'],
      nn = grid_search[j, 'nn']
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = 500,
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_rate_2[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the 10 folds for each grid combination
perf_rate_2_f <- apply(perf_rate_2, 1, mean)
# Plot: the assignment below had been swallowed by the preceding comment line,
# so `perf` was never attached to grid_search.
grid_search$perf <- perf_rate_2_f
ggplot(data = grid_search, aes(x = rate, y = perf)) +
  geom_point()
5)结论:rate = 4, nn = 9时最优
4.调试parms参数中的numTrees参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(3)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)numTrees参数
# Candidate forest sizes: 100 to 1000 trees in steps of 100.
grid_search <- expand.grid(
  numTrees = seq(from = 100, to = 1000, by = 100)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_numTrees_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over numTrees. For every row j of grid_search and every fold i:
# hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's numTrees, score the held-out rows, and store
# the fold AUC in perf_numTrees_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numTrees_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the 10 folds for each numTrees candidate
perf_numTrees_1_f <- apply(perf_numTrees_1, 1, mean)
# Attach the means to the grid, then plot them against numTrees.
# (The assignment and the ggplot call were fused into one unparseable line.)
grid_search$perf <- perf_numTrees_1_f
ggplot(data = grid_search, aes(x = numTrees, y = perf)) +
  geom_point()
5)结论:numTrees=600时最优
5.调试parms参数中的numLeaves参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(4)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)numLeaves参数
# numTrees fixed at 600; numLeaves candidates are the powers of two 2^5 .. 2^9.
grid_search <- expand.grid(
  numTrees = 600,
  numLeaves = 2^(5:9)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_numLeaves_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over numLeaves. For every row j of grid_search and every fold i:
# hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_numLeaves_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numLeaves_1[j, i] <- rx_tr_roc$auc
  }
}
# Row-wise mean of the fold AUCs, stored on the grid as `perf` for plotting
perf_numLeaves_1_f <- apply(X = perf_numLeaves_1, MARGIN = 1, FUN = mean)
grid_search$perf <- perf_numLeaves_1_f
# Scatter plot of mean AUC against numLeaves
ggplot(grid_search, aes(x = numLeaves, y = perf)) + geom_point()
5)结论:numLeaves=2^7时最优
6.调试parms参数中的minSplit参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(5)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)minSplit参数
# numTrees and numLeaves fixed; minSplit candidates run from 5 to 30 in steps of 5.
# NOTE(review): numTrees is 500 here although the numLeaves search used 600 — confirm this is intended.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = seq(from = 5, to = 30, by = 5)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_minSplit_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over minSplit. For every row j of grid_search and every fold i:
# hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_minSplit_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_minSplit_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds for each minSplit candidate (missing folds ignored)
perf_minSplit_1_f <- apply(perf_minSplit_1, 1, mean, na.rm = TRUE)
# Attach the means to the grid, then plot them against minSplit.
# (The assignment and the ggplot call were fused into one unparseable line.)
grid_search$perf <- perf_minSplit_1_f
ggplot(data = grid_search, aes(x = minSplit, y = perf)) +
  geom_point()
5)结论:minSplit=25 时最优
7.调试parms参数中的exampleFraction参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(6)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)exampleFraction参数
# Earlier parameters fixed; exampleFraction candidates run from .55 to .9 in steps of .05.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = seq(from = .55, to = .9, by = .05)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_exampleFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over exampleFraction. For every row j of grid_search and every
# fold i: hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_exampleFraction_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_exampleFraction_1[j, i] <- rx_tr_roc$auc
  }
}
# Row-wise mean of the fold AUCs (NA folds skipped), stored on the grid as `perf`
perf_exampleFraction_1_f <- apply(
  X = perf_exampleFraction_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search$perf <- perf_exampleFraction_1_f
# Scatter plot of mean AUC against exampleFraction
ggplot(grid_search, aes(x = exampleFraction, y = perf)) + geom_point()
5)结论:exampleFraction=.6 时最优,[.55, .65]区间比较好,但是[.55, 1]区间内变化不大
8.调试parms参数中的featureFraction参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(7)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)featureFraction参数
# Earlier parameters fixed; featureFraction candidates run from .5 to .9 in steps of .05.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = seq(from = .5, to = .9, by = .05)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_featureFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over featureFraction. For every row j of grid_search and every
# fold i: hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_featureFraction_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_featureFraction_1[j, i] <- rx_tr_roc$auc
  }
}
# Row-wise mean of the fold AUCs (NA folds skipped), stored on the grid as `perf`
perf_featureFraction_1_f <- apply(
  X = perf_featureFraction_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search$perf <- perf_featureFraction_1_f
# Scatter plot of mean AUC against featureFraction
ggplot(grid_search, aes(x = featureFraction, y = perf)) +
  geom_point()
5)结论:featureFraction=.85 时最优,[.75, .9]区间比较好
9.调试parms参数中的splitFraction参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(8)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)splitFraction参数
# Earlier parameters fixed; splitFraction candidates run from .5 to .95 in steps of .05.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = seq(from = .5, to = .95, by = .05)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_splitFraction_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over splitFraction. For every row j of grid_search and every
# fold i: hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_splitFraction_1[j, i].
# The original text closed only the inner loop; the outer loop's closing brace
# is restored here so the block parses.
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_splitFraction_1[j, i] <- rx_tr_roc$auc
  }
}
# Row-wise mean of the fold AUCs (NA folds skipped), stored on the grid as `perf`
perf_splitFraction_1_f <- apply(
  X = perf_splitFraction_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search$perf <- perf_splitFraction_1_f
# Scatter plot of mean AUC against splitFraction
ggplot(grid_search, aes(x = splitFraction, y = perf)) + geom_point()
5)结论:splitFraction=.5 时最优,但是变化细微
10.调试parms参数中的numBins参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(9)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)numBins参数
# Earlier parameters fixed; numBins candidates run from 105 to 505 in steps of 50.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = seq(from = 105, to = 505, by = 50)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_numBins_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
# Grid search over numBins. For every row j of grid_search and every fold i:
# hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_numBins_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numBins_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds for each numBins candidate (missing folds ignored)
perf_numBins_1_f <- apply(perf_numBins_1, 1, mean, na.rm = TRUE)
# Attach the means to the grid, then plot them against numBins with a smoother.
# (The assignment and the ggplot call were fused into one unparseable line.)
grid_search$perf <- perf_numBins_1_f
ggplot(data = grid_search, aes(x = numBins, y = perf)) +
  geom_point() +
  geom_smooth()
5)结论:numBins=350 时最优,但是变化细微
11.调试parms参数中的firstUsePenalty参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(10)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)firstUsePenalty参数
# Earlier parameters fixed; firstUsePenalty candidates run from 0 to 1 in steps of .2.
# NOTE(review): the article reports 1.2 as the best firstUsePenalty, which is outside this range — confirm the grid that was actually run.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = seq(from = 0, to = 1, by = .2)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_firstUsePenalty_1 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over firstUsePenalty. For every row j of grid_search and every
# fold i: hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_firstUsePenalty_1[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      firstUsePenalty = grid_search[j, 'firstUsePenalty'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_firstUsePenalty_1[j, i] <- rx_tr_roc$auc
  }
}
# Row-wise mean of the fold AUCs (NA folds skipped), stored on the grid as `perf`
perf_firstUsePenalty_1_f <- apply(
  X = perf_firstUsePenalty_1, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search$perf <- perf_firstUsePenalty_1_f
# Scatter plot with smoother: mean AUC against firstUsePenalty
ggplot(grid_search, aes(x = firstUsePenalty, y = perf)) +
  geom_point() + geom_smooth()
5)结论:firstUsePenalty=1.2 时最优
12.调试parms参数中的gainConfLevel参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(11)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)gainConfLevel参数
# Earlier parameters fixed; gainConfLevel candidates run from .01 to .1 in steps of .01.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = 1.2,
  gainConfLevel = seq(from = .01, to = .1, by = .01)
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_gainConfLevel_2 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Grid search over gainConfLevel. For every row j of grid_search and every
# fold i: hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_gainConfLevel_2[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      firstUsePenalty = grid_search[j, 'firstUsePenalty'],
      gainConfLevel = grid_search[j, 'gainConfLevel'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_gainConfLevel_2[j, i] <- rx_tr_roc$auc
  }
}
# Row-wise mean of the fold AUCs (NA folds skipped), stored on the grid as `perf`
perf_gainConfLevel_2_f <- apply(
  X = perf_gainConfLevel_2, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search$perf <- perf_gainConfLevel_2_f
# Scatter plot with smoother: mean AUC against gainConfLevel
ggplot(grid_search, aes(x = gainConfLevel, y = perf)) +
  geom_point() + geom_smooth()
5)结论:gainConfLevel=0.05 时最优
13.再次调试parms参数中的numTrees参数
1)十折交叉验证
# Fix the RNG seed, then split the row indices of rx_tr into 10 CV folds.
# The two statements were fused on one line (a syntax error), and rx_tr_nrow
# was never defined, so the row count is taken from rx_tr directly.
set.seed(12)
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
2)numTrees参数
# Re-tune numTrees (200 to 600 in steps of 100) with all other parameters fixed.
grid_search <- expand.grid(
  numTrees = seq(from = 200, to = 600, by = 100),
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = 1.2,
  gainConfLevel = .05
)
3)构建perf矩阵放置auc值
# AUC store: one row per grid combination, one column per CV fold (10 folds).
perf_numTrees_2 <- matrix(data = NA, nrow = nrow(grid_search), ncol = 10)
4)计算auc值
# Second grid search over numTrees. For every row j of grid_search and every
# fold i: hold fold i out, SMOTE the remaining rows (rate = 4, nn = 9), fit
# rxFastForest with that row's parameters, score the held-out rows, and store
# the fold AUC in perf_numTrees_2[j, i].
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split the data: fold i is the validation part
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[folds[[i]], ]
    # SMOTE resampling, applied to the training part only
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(rx_tr_1_task, rate = 4, nn = 9)
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      firstUsePenalty = grid_search[j, 'firstUsePenalty'],
      gainConfLevel = grid_search[j, 'gainConfLevel'],
      trainThreads = 4
    )
    # Predict on the held-out fold
    rx_tr_pre <- rxPredict(rx_tr_mod, rx_tr_2)
    # Numeric copy of the true label so roc() can consume it
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of this fold
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numTrees_2[j, i] <- rx_tr_roc$auc
  }
}
# Row-wise mean of the fold AUCs (NA folds skipped), stored on the grid as `perf`
perf_numTrees_2_f <- apply(
  X = perf_numTrees_2, MARGIN = 1, FUN = mean, na.rm = TRUE
)
grid_search$perf <- perf_numTrees_2_f
# Scatter plot with smoother: mean AUC against numTrees
ggplot(grid_search, aes(x = numTrees, y = perf)) +
  geom_point() + geom_smooth()
5)结论:numTrees=600 时最优
结论:由于参数不发生变化,因此停止训练
三、集成学习
0)参数
# Build the parameter pool for the ensemble: draw a few random values per
# parameter from a window around its tuned value, take all combinations, then
# keep 100 randomly chosen combinations in grid_search2.
# (set.seed and the grid assignment were fused on one line, a syntax error.)
set.seed(1)
grid_search <- expand.grid(
  numTrees = sample(550:650, 10, replace = FALSE),
  numLeaves = sample(124:132, 4, replace = FALSE),
  minSplit = sample(24:26, 2, replace = FALSE),
  exampleFraction = sample(550:650, 10, replace = FALSE) / 1000,
  featureFraction = sample(750:900, 10, replace = FALSE) / 1000,
  splitFraction = sample(45:55, 5, replace = FALSE) / 100,
  numBins = sample(320:380, 5, replace = FALSE),
  firstUsePenalty = sample(115:125, 5, replace = FALSE) / 100,
  gainConfLevel = sample(45:55, 5, replace = FALSE) / 1000
)
# Pick 100 distinct rows of the full grid
sample_ind <- sample(nrow(grid_search), 100, replace = FALSE)
grid_search2 <- grid_search[sample_ind, ]
# The full grid is large and no longer needed
rm(grid_search)
1) 放置结果
# One slot per ensemble member; preallocated so the training loop does not
# grow the list on every iteration.
rxfastforest.pred <- vector('list', nrow(grid_search2))
2)训练
# Train one rxFastForest per row of grid_search2 on a fresh SMOTE resample of
# the full training set, predict the test set rx_te, and keep each model's
# Probability.1 column in rxfastforest.pred[[i]].
# The classification task depends only on rx_tr, so it is built once up front;
# smote() stays inside the loop so every model gets its own resample.
rx_tr_task <- makeClassifTask(data = rx_tr, target = 'TARGET')
for (i in seq_len(nrow(grid_search2))) {
  # SMOTE resampling
  rx_tr_task_smote <- smote(
    rx_tr_task,
    rate = 4,
    nn = 9
  )
  rx_tr_2 <- getTaskData(rx_tr_task_smote)
  # Train with the i-th parameter combination
  rx_tr_mod <- rxFastForest(
    formula = rx_formula,
    data = rx_tr_2,
    numTrees = grid_search2[i, 'numTrees'],
    numLeaves = grid_search2[i, 'numLeaves'],
    minSplit = grid_search2[i, 'minSplit'],
    exampleFraction = grid_search2[i, 'exampleFraction'],
    featureFraction = grid_search2[i, 'featureFraction'],
    splitFraction = grid_search2[i, 'splitFraction'],
    numBins = grid_search2[i, 'numBins'],
    firstUsePenalty = grid_search2[i, 'firstUsePenalty'],
    gainConfLevel = grid_search2[i, 'gainConfLevel'],
    trainThreads = 4
  )
  # Predict the test set
  rx_tr_pre <- rxPredict(
    rx_tr_mod,
    rx_te
  )
  rxfastforest.pred[[i]] <- rx_tr_pre$Probability.1
}
3)结果
# Lay the per-model prediction vectors out as columns (one column per model)
# and average across models for each test row. The column count follows the
# number of stored models instead of a hard-coded 100.
rxfastforest.pred2 <- matrix(
  unlist(rxfastforest.pred),
  ncol = length(rxfastforest.pred)
)
rxfastforest.pred3 <- data.frame(prob1 = apply(rxfastforest.pred2, 1, mean))
4)输出
# Save the averaged ensemble probabilities to CSV.
write.csv(
  x = rxfastforest.pred3,
  file = "C:/Users/Administrator/Documents/kaggle/scs_rf/rxfastforest.pred1.csv"
)
公众号后台回复关键字即可学习
回复 R R语言快速入门及数据挖掘
回复 Kaggle案例 Kaggle十大案例精讲(连载中)
回复 文本挖掘 手把手教你做文本挖掘
回复 可视化 R语言可视化在商务场景中的应用
回复 大数据 大数据系列免费视频教程
回复 量化投资 张丹教你如何用R语言量化投资
回复 用户画像 京东大数据,揭秘用户画像
回复 数据挖掘 常用数据挖掘算法原理解释与应用
回复 机器学习 人工智能系列之机器学习与实践
回复 爬虫 R语言爬虫实战案例分享