xgb-tidymodel.Rmd
R-sasctl has tools to help you create all the necessary files to upload an R model to SAS Viya Model Manager and take advantage of its many features.
We will be using the well-known home equity data set. We will convert the empty character values to proper NAs and split the data into training and test sets.
library("sasctl")
library("tidymodels")
## download and prepare data
hmeq <- read.csv("https://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv")
hmeq[hmeq == ""] <- NA
hmeq$BAD <- as.factor(hmeq$BAD)
## split data
hmeq_split <- initial_split(hmeq, prop = .8, strata = "BAD")
hmeqTrain <- training(hmeq_split)
hmeqTest <- testing(hmeq_split)
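Note that initial_split() draws the sample at random, so each run produces a different partition. If you want the split (and everything downstream) to be reproducible, set a seed before splitting. A minimal sketch of the same split, seeded (the seed value is arbitrary):
## fix the RNG state so the stratified split is reproducible
set.seed(1234)
hmeq_split <- initial_split(hmeq, prop = .8, strata = "BAD")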
We will be using the tidymodels framework, which greatly simplifies the modeling process and lets you embed pre- and post-processing with the model. You can get more information about how to use it on the tidymodels website (https://www.tidymodels.org/).
This first model will be a simple XGBoost model for classification. It requires you to have the xgboost package installed, but you don't have to deal with its specific features or call the library directly.
## set model specification
xgb_spec <- boost_tree() %>%
set_engine("xgboost") %>%
set_mode("classification")
We will also create a “recipe”, a set of data pre-processing steps. In this case, whenever you score the model, it will automatically impute any missing data and create dummy variables, which is a requirement for xgboost.
## recipe for data transformation
rec <- recipe(BAD ~ . , data = hmeqTrain)
impute_rec <- rec %>%
step_impute_mode(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_impute_mean(all_numeric_predictors())
imp <- prep(impute_rec)
imp
#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 12
#>
#> ── Training information
#> Training data contained 4767 data points and 2069 incomplete rows.
#>
#> ── Operations
#> • Mode imputation for: REASON and JOB | Trained
#> • Dummy variables from: REASON and JOB | Trained
#> • Mean imputation for: LOAN, MORTDUE, VALUE, YOJ, DEROG, ... | Trained
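If you want to see the design matrix the model will actually receive, including the dummy columns created by step_dummy(), you can bake() the trained recipe on a few rows. A quick check using the imp object from above:
## apply the trained recipe to a handful of test rows
baked <- bake(imp, new_data = head(hmeqTest))
## note the REASON_* and JOB_* dummy columns
colnames(baked)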
Now we will combine the model and the pre-processing in a workflow. Workflows are recommended because they let you bundle all the steps required to transform the data into the format the model expects. Even if you are not using any pre-processing, you should add the model to a workflow as a good practice, and sasctl::codegen() only generates scoring code for workflows in the current version.
## creating the workflow
xgb_wf <- workflow() %>%
add_model(xgb_spec) %>%
add_recipe(impute_rec)
## fitting the model and the pre-processing
xgb_fitted <- xgb_wf %>%
fit(data = hmeqTrain)
xgb_fitted
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: boost_tree()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 3 Recipe Steps
#>
#> • step_impute_mode()
#> • step_dummy()
#> • step_impute_mean()
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> ##### xgb.Booster
#> raw: 51.6 Kb
#> call:
#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0,
#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1,
#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist,
#> verbose = 0, nthread = 1, objective = "binary:logistic")
#> params (as set within xgb.train):
#> eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", nthread = "1", objective = "binary:logistic", validate_parameters = "TRUE"
#> xgb.attributes:
#> niter
#> callbacks:
#> cb.evaluation.log()
#> # of features: 16
#> niter: 15
#> nfeatures : 16
#> evaluation_log:
#> iter training_logloss
#> <num> <num>
#> 1 0.5209664
#> 2 0.4225333
#> --- ---
#> 14 0.1771564
#> 15 0.1700097
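Before packaging the model it is worth a quick sanity check on the holdout set. A minimal sketch using yardstick (attached with tidymodels); event_level = "second" is needed because the event level 1 is the second level of the BAD factor:
## predicted event probabilities on the holdout set
testPred <- predict(xgb_fitted, new_data = hmeqTest, type = "prob")
testPred$BAD <- hmeqTest$BAD
## area under the ROC curve for the event BAD == 1
roc_auc(testPred, truth = BAD, .pred_1, event_level = "second")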
Now we will start creating all the files required to successfully upload the model to SAS Model Manager.
path <- "xgbModel/"
modelrda <- "Rxgb.rda"
dir.create(path)
#> Warning in dir.create(path): 'xgbModel' already exists
saveRDS(xgb_fitted, paste0(path, modelrda), version = 2)
We will score the whole dataset and add the partition indicators so we can create the diagnostics JSON files. In parallel you can create any other files or run other processes, but SAS Model Manager expects these specific files to be able to execute its routines, such as comparing models.
scoreddf <- predict(xgb_fitted, new_data = hmeq, type = "prob")
head(scoreddf)
#> # A tibble: 6 × 2
#> .pred_0 .pred_1
#> <dbl> <dbl>
#> 1 0.103 0.897
#> 2 0.0562 0.944
#> 3 0.0882 0.912
#> 4 0.256 0.744
#> 5 0.576 0.424
#> 6 0.177 0.823
scoreddf$partition <- NA
scoreddf[hmeq_split$in_id,]["partition"] <- 1
scoreddf[-hmeq_split$in_id,]["partition"] <- 2
scoreddf$BAD <- as.numeric(hmeq$BAD) - 1
colnames(scoreddf) <- c("P_BAD0", "P_BAD1", "partition", "BAD")
head(scoreddf)
#> # A tibble: 6 × 4
#> P_BAD0 P_BAD1 partition BAD
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.103 0.897 1 1
#> 2 0.0562 0.944 1 1
#> 3 0.0882 0.912 1 1
#> 4 0.256 0.744 1 1
#> 5 0.576 0.424 1 0
#> 6 0.177 0.823 1 1
diags <- diagnosticsJson(traindf = scoreddf[scoreddf$partition == 1,],
testdf = scoreddf[scoreddf$partition == 2,],
targetEventValue = 1,
targetName = "BAD",
targetPredicted = "P_BAD1",
path = path)
#> [1] "File written to xgbModel/dmcas_lift.json"
#> [1] "File written to xgbModel/dmcas_roc.json"
#> [1] "File written to xgbModel/dmcas_fitstat.json"
## This is not very legible, but gives an idea of all the data created
do.call(rbind, lapply(diags$fitStat$parameterMap, as.data.frame))
#> parameter type label
#> _RASE_ _RASE_ num Root Average Squared Error
#> _NObs_ _NObs_ num Sum of Frequencies
#> _GINI_ _GINI_ num Gini Coefficient
#> _GAMMA_ _GAMMA_ num Gamma
#> _formattedPartition_ _formattedPartition_ char Formatted Partition
#> _DataRole_ _DataRole_ char Data Role
#> _MCE_ _MCE_ num Misclassification Rate
#> _ASE_ _ASE_ num Average Squared Error
#> _MCLL_ _MCLL_ num Multi-Class Log Loss
#> _KS_ _KS_ num KS (Youden)
#> _KSPostCutoff_ _KSPostCutoff_ num ROC Separation
#> _DIV_ _DIV_ num Divisor for ASE
#> _TAU_ _TAU_ num Tau
#> _KSCut_ _KSCut_ num KS Cutoff
#> _C_ _C_ num Area Under ROC
#> _PartInd_ _PartInd_ num Partition Indicator
#> _TargetName_ _TargetName_ char Target Name
#> length order values preformatted
#> _RASE_ 8 7 _RASE_ FALSE
#> _NObs_ 8 4 _NObs_ FALSE
#> _GINI_ 8 12 _GINI_ FALSE
#> _GAMMA_ 8 13 _GAMMA_ FALSE
#> _formattedPartition_ 12 3 _formattedPartition_ FALSE
#> _DataRole_ 10 1 _DataRole_ FALSE
#> _MCE_ 8 8 _MCE_ FALSE
#> _ASE_ 8 5 _ASE_ FALSE
#> _MCLL_ 8 9 _MCLL_ FALSE
#> _KS_ 8 10 _KS_ FALSE
#> _KSPostCutoff_ 8 16 _KSPostCutoff_ FALSE
#> _DIV_ 8 6 _DIV_ FALSE
#> _TAU_ 8 14 _TAU_ FALSE
#> _KSCut_ 8 15 _KSCut_ FALSE
#> _C_ 8 11 _C_ FALSE
#> _PartInd_ 8 2 _PartInd_ FALSE
#> _TargetName_ 10 1 _TargetName_ FALSE
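If you prefer to inspect the statistics as they were written to disk, you can read any of the JSON files back directly. A quick look, assuming the jsonlite package is available:
## read the fit statistics file diagnosticsJson() just wrote
fitstat <- jsonlite::fromJSON(paste0(path, "dmcas_fitstat.json"))
str(fitstat, max.level = 2)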
Scoring code is required to run R and Python models in SAS Model Manager. sasctl::codegen() helps you create code that follows the expected format for tidymodels workflows. Other models and frameworks may be added in the future.
You will notice that the scoring code has many EM_* and P_<<target>><<level>> variables. They are not required, but they keep the outputs consistent with models created through the SAS Viya UI, making it easier to mix these models.
We can use the inputs argument to generate alternative input variables. In this case it is useful because we used the pre-processing step recipes::step_dummy(), which created additional dummy variables that are not expected when scoring the model, and we do not want to pollute our Model Manager with dummy variables: LOAN, MORTDUE, VALUE, YOJ, DEROG, DELINQ, CLAGE, NINQ, CLNO, DEBTINC, REASON_HomeImp, JOB_Office, JOB_Other, JOB_ProfExe, JOB_Sales, JOB_Self. Instead, the score code will expose the original columns LOAN, MORTDUE, VALUE, REASON, JOB, YOJ, DEROG, DELINQ, CLAGE, NINQ, CLNO, DEBTINC (removing the BAD variable, of course).
code <- codegen(xgb_fitted,
path = paste0(path, "scoreCode.R"),
inputs = colnames(hmeq)[-1],
referenceLevel = 1,
rds = modelrda)
#> File written to: xgbModel/scoreCode.R
code
#> library("tidymodels")
#> library("xgboost")
#>
#> scoreFunction <- function(LOAN, MORTDUE, VALUE, REASON, JOB, YOJ, DEROG, DELINQ, CLAGE, NINQ, CLNO, DEBTINC)
#> {
#> #output: EM_CLASSIFICATION, EM_EVENTPROBABILITY, EM_PROBABILITY, I_BAD, BAD, P_BAD0, P_BAD1
#>
#> if (!exists("sasctlRmodel"))
#> {
#> assign("sasctlRmodel", readRDS(file = paste(rdsPath, "Rxgb.rda", sep = "")), envir = .GlobalEnv)
#>
#> }
#> target_labels <- c('0', '1')
#>
#> data <- data.frame(LOAN = LOAN,
#> MORTDUE = MORTDUE,
#> VALUE = VALUE,
#> REASON = REASON,
#> JOB = JOB,
#> YOJ = YOJ,
#> DEROG = DEROG,
#> DELINQ = DELINQ,
#> CLAGE = CLAGE,
#> NINQ = NINQ,
#> CLNO = CLNO,
#> DEBTINC = DEBTINC)
#>
#> predictions <- predict(sasctlRmodel, new_data = data, type = "prob")
#>
#> boolClass <- (predictions == do.call(pmax, predictions))
#> predictions[".pred"] <- apply(boolClass, 1 , function(x) target_labels[x])
#>
#> output_list <- list(EM_CLASSIFICATION = predictions[[".pred"]],
#> EM_EVENTPROBABILITY = predictions[[".pred_1"]],
#> EM_PROBABILITY = apply(subset(predictions, select = -c(.pred)), 1, max),
#> I_BAD = predictions[[".pred"]],
#> BAD = predictions[[".pred"]],
#> P_BAD0 = predictions[[".pred_0"]],
#> P_BAD1 = predictions[[".pred_1"]]
#> )
#>
#> return(output_list)
#> }
## getting the .rda Path to be called in the function
## this is simulating what Viya passes to R when calling it
rdsPath <- path
## Calling the generated code string as code to
## create the function locally
codeExpression <- str2expression(code)
eval(codeExpression)
#>
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#>
#> slice
## this is a helper to create the variables
## cat(paste0(colnames(hmeq)[-1], " = hmeq[, '", colnames(hmeq)[-1],"']", collapse = ",\n " ))
scoreRes <- scoreFunction(LOAN = hmeq[, 'LOAN'],
MORTDUE = hmeq[, 'MORTDUE'],
VALUE = hmeq[, 'VALUE'],
REASON = hmeq[, 'REASON'],
JOB = hmeq[, 'JOB'],
YOJ = hmeq[, 'YOJ'],
DEROG = hmeq[, 'DEROG'],
DELINQ = hmeq[, 'DELINQ'],
CLAGE = hmeq[, 'CLAGE'],
NINQ = hmeq[, 'NINQ'],
CLNO = hmeq[, 'CLNO'],
DEBTINC = hmeq[, 'DEBTINC'])
scoreRes <- as.data.frame(scoreRes)
head(scoreRes)
#> EM_CLASSIFICATION EM_EVENTPROBABILITY EM_PROBABILITY I_BAD BAD P_BAD0
#> 1 1 0.8968068 0.8968068 1 1 0.10319321
#> 2 1 0.9438310 0.9438310 1 1 0.05616896
#> 3 1 0.9117607 0.9117607 1 1 0.08823925
#> 4 1 0.7438112 0.7438112 1 1 0.25618878
#> 5 0 0.4235148 0.5764852 0 0 0.57648522
#> 6 1 0.8228945 0.8228945 1 1 0.17710546
#> P_BAD1
#> 1 0.8968068
#> 2 0.9438310
#> 3 0.9117607
#> 4 0.7438112
#> 5 0.4235148
#> 6 0.8228945
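Since the score function simply wraps the saved workflow, its probabilities should reproduce predict() on xgb_fitted exactly; a quick sanity check:
## the generated score code should match the workflow predictions
all.equal(scoreRes$P_BAD1,
          predict(xgb_fitted, new_data = hmeq, type = "prob")[[".pred_1"]])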
Now we create some additional files which are required to configure SAS Model Manager when uploading the model. The variable files, specifically, should match the model's inputs and outputs. We can use the result from the scoring code test in the previous chunk. Otherwise, you can create a data.frame with the expected outputs and pass it.
write_in_out_json(hmeq[,-1], input = TRUE, path = path)
#> [1] "File written to xgbModel/inputVar.json"
#> name length type level role
#> 1 LOAN 8 decimal interval input
#> 2 MORTDUE 8 decimal interval input
#> 3 VALUE 8 decimal interval input
#> 4 REASON 7 string nominal input
#> 5 JOB 7 string nominal input
#> 6 YOJ 8 decimal interval input
#> 7 DEROG 8 decimal interval input
#> 8 DELINQ 8 decimal interval input
#> 9 CLAGE 8 decimal interval input
#> 10 NINQ 8 decimal interval input
#> 11 CLNO 8 decimal interval input
#> 12 DEBTINC 8 decimal interval input
write_in_out_json(scoreRes, input = FALSE, path = path)
#> [1] "File written to xgbModel/outputVar.json"
#> name length type level role
#> 1 EM_CLASSIFICATION 1 string nominal output
#> 2 EM_EVENTPROBABILITY 8 decimal interval output
#> 3 EM_PROBABILITY 8 decimal interval output
#> 4 I_BAD 1 string nominal output
#> 5 BAD 1 string nominal output
#> 6 P_BAD0 8 decimal interval output
#> 7 P_BAD1 8 decimal interval output
write_fileMetadata_json(scoreCodeName = "scoreCode.R",
scoreResource = modelrda,
path = path)
#> [1] "File written to xgbModel/fileMetadata.json"
#> role name
#> 1 inputVariables inputVar.json
#> 2 outputVariables outputVar.json
#> 3 score scoreCode.R
#> 4 scoreResource Rxgb.rda
write_ModelProperties_json(modelName = "Rxgb",
modelFunction = "Classification",
trainTable = "hmeq",
algorithm = "XGBoost",
numTargetCategories = 2,
targetEvent = "1",
targetVariable = "BAD",
eventProbVar = "P_BAD1",
modeler = "sasctl man",
path = path)
#> [1] "File written to xgbModel/ModelProperties.json"
#> value
#> name Rxgb
#> description R model
#> function Classification
#> scoreCodeType R
#> trainTable hmeq
#> trainCodeType R
#> algorithm XGBoost
#> targetVariable BAD
#> targetEvent 1
#> targetLevel Binary
#> eventProbVar P_BAD1
#> modeler sasctl man
#> tool R
#> toolVersion 4.4.2
files_to_zip <- list.files(path, "\\.(json|R|rda)$", full.names = TRUE)
### grouping all the files to a single zip
zip(paste0(path, "Rmodel.zip"),
files = files_to_zip)
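Before uploading, you can confirm that every file made it into the archive with base R's unzip():
## list the archive contents without extracting
unzip(paste0(path, "Rmodel.zip"), list = TRUE)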
Finally, we can upload the model to our SAS Viya server.
sess <- sasctl::session("https://viya.server.com",
username = "username",
password = "s3cr3t!")
mod <- register_model(
session = sess,
file = paste0(path, "Rmodel.zip"),
name = "Rxgb",
type = "zip",
project = "R_sasctl",
force = TRUE
)
## deleting the files locally
unlink(path, recursive = TRUE)