-
Notifications
You must be signed in to change notification settings - Fork 0
/
wrapper.xgb.cv.continuous.r
81 lines (70 loc) · 3.28 KB
/
wrapper.xgb.cv.continuous.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
library(caret)
library(ggplot2)
library(pdp)
library(xgboost)
library(boot)
library(yardstick)
source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
source(paste0(ScriptDir,"xgb.cv.partial.r"))
source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
source(paste0(ScriptDir,"xgb.cv.fit.scatterplot.r"))
source(paste0(ScriptDir,"xgb.cv.interaction.r"))
source(paste0(ScriptDir,"xgb.cv.makefolds.R"))
xgb.cv.continuous = function(Data,Predictors,Response,Objective = "reg:absoluteerror",Metric = "mae",path,Nfolds = 10,Nrounds = 10000,LearningRate=0.001
,Nthread = 2,MaxDepth=3,save = TRUE, Folds = NULL, Monotone = NULL,DoInteraction = TRUE)
{
CVtrain_x = as.matrix(Data[, colnames(Data) %in% Predictors])
CVtrain_y = Data[,colnames(Data) == Response]
###Convert fold vector (if supplied) to list of obsrvations in each fold
###Assumes length of fold vector = nrow(Data)
K = Nfolds
FoldList = NULL
if(is.null(Folds)==FALSE)
{
K = min(Nfolds,length(unique(Folds)))
FoldList <- xgb.cv.makefolds(as.factor(Folds), K)
}
Nfolds = K
if(is.null(Monotone)==TRUE)
Monotone = rep(0,times = ncol(CVtrain_x))
cv <- xgb.cv(data = CVtrain_x, stratified = TRUE,label = CVtrain_y,nrounds = Nrounds, nthread = Nthread, nfold = Nfolds,folds = FoldList,monotone_constraints =Monotone,
max_depth = MaxDepth, eta = min(50,Nrounds),objective = Objective,metric = Metric,prediction = TRUE,print_every_n = 50,learning_rate = LearningRate,
save_models = TRUE,early_stopping_rounds = 50,callbacks = list(cb.cv.predict(save_models = TRUE)))
Nfolds = length(cv$models)
#mean(CVtrain_y)
#mean(cv$pred)
Cor = round(cor(CVtrain_y,cv$pred),digits = 3)
if(save==TRUE)
saveRDS(cv,paste0(path,"xgb.cv.continuous.rds"))
###Print scatter plot of predicted against observed values of response
xgbm.cv.fit.scatterplot(cv$pred,Data[, colnames(Data) == Response],path)
####Use custom function to generate predictor importance bar plots
Filename = paste0(path,"PredictorImportance.png")
Names = colnames(CVtrain_x)
Filename = paste0(path,"PredictorImportance.png")
Importance <- xgb.cv.importance.plot(cv, #ouput from xgb.cv. Be sure to use callback to save cv models
Nfolds, #number of fold models used in cross-validaton
Predictors= Names[Names%in% Predictors],#names of predictor variables
#this ensures names in right order
#for importance function
Filename)#location to print bar plot
####Use custom function to generate partial dependency plots
PartialDir = paste0(path,"PartialDependencePlots/")
dir.create(PartialDir,showWarnings = FALSE)
for(var in 1:length(Predictors))
xgbm.cv.partial(cv,Nfolds = Nfolds,na.omit(CVtrain_x),var,path = PartialDir,CVtrain_y=CVtrain_y,ResponseName = Response)
###Do interaction last as hstats changes model predictions somehow in partial plots
if(DoInteraction == TRUE)
Interaction = xgb.cv.interaction(cv,na.omit(CVtrain_x),Predictors,Nfolds)
OutList = list()
Key = "Model"
OutList[[Key]] = cv
Key = "Correlation"
OutList[[Key]] = Cor
Key = "Predictor importance"
OutList[[Key]]= Importance
Key = "Interaction"
if(DoInteraction == TRUE)
OutList[[Key]] = Interaction
return(OutList)
}