DALEX with sparklyr
Wondering if DALEX will play nicely with sparklyr or if we can get this kind of information out of models & data in spark?
I guess it should (as it works with h2o). Could you provide an example of R code that creates a sparklyr model?
library('sparklyr')
library('dplyr')
# Open the Spark connection and copy the built-in mtcars data into Spark.
sc <- spark_connect(master = "yarn-client")
mtcars
mtcars_spark <- copy_to(sc, mtcars)

# Randomly partition the Spark table with weights 0.7 (train) / 0.3 (test).
parts <- sdf_random_split(mtcars_spark, train = 0.7, test = 0.3)
TRAIN <- parts[["train"]]
TEST <- parts[["test"]]

# Fit a random forest regressor of mpg on every other column, then score TEST.
spark_random_forest_model <- ml_random_forest_regressor(TRAIN, mpg ~ .)
spark_random_forest_model
TEST.scored <- ml_predict(spark_random_forest_model, TEST)
TEST.scored
## Cross-Validated Example
# Build the formula string "mpg ~ <every other column>" for ft_r_formula().
features_outcome <- colnames(TRAIN)
features <- features_outcome[!features_outcome %in% "mpg"]
features_plus <- paste(features, collapse = " + ")
my_formula <- paste0("mpg ~ ", features_plus)

# Grid to search; the outer name must match the uid given to the estimator
# stage in the pipeline below.
param_grid <- list(
  randForest = list(
    num_trees = c(200, 400, 1600)
  )
)

# Pipeline: R-formula feature assembly followed by a random forest regressor.
rf_pipeline <- ml_pipeline(sc) %>%
  ft_r_formula(my_formula) %>%
  ml_random_forest_regressor(
    feature_subset_strategy = "onethird",
    seed = sample(1:10000, 1),
    max_depth = 30,
    subsampling_rate = 0.90,
    uid = "randForest"
  )

rf_cv <- ml_cross_validator(
  sc,
  estimator = rf_pipeline,
  estimator_param_maps = param_grid,
  # mpg is continuous, so the folds are scored with a regression metric.
  # The previous multiclass "weightedPrecision" evaluator is meant for class
  # labels and gives no meaningful ranking of regression models.
  evaluator = ml_regression_evaluator(sc, metric_name = "rmse"),
  num_folds = 3
)

# Fit the cross-validator on the training partition and score the test one.
cv_model <- rf_cv %>% ml_fit(TRAIN)
TEST.scored <- ml_predict(cv_model, TEST)
TEST.scored
@pbiecek thanks in advance for any assistance!
Following the H2O example, we can indeed use DALEX.
On my wish list would be extending the existing methods to work directly with Spark data frames and Spark models. As it stands, we are limited to whatever data is passed to the explain function.
If the Spark data frame is very large, we would need to sample it in order to use DALEX and breakDown.
library('sparklyr')
library('dplyr')
# Open the Spark connection and copy the built-in mtcars data into Spark.
sc <- spark_connect(master = "yarn-client")
mtcars
mtcars_spark <- copy_to(sc, mtcars)

# Randomly partition the Spark table with weights 0.7 (train) / 0.3 (test).
parts <- sdf_random_split(mtcars_spark, train = 0.7, test = 0.3)
TRAIN <- parts[["train"]]
TEST <- parts[["test"]]

# Fit a random forest regressor of mpg on every other column, then score TEST.
spark_random_forest_model <- ml_random_forest_regressor(TRAIN, mpg ~ .)
spark_random_forest_model
TEST.scored <- ml_predict(spark_random_forest_model, TEST)
TEST.scored
## Cross-Validated Example
# Build the formula string "mpg ~ <every other column>" for ft_r_formula().
features_outcome <- colnames(TRAIN)
features <- features_outcome[!features_outcome %in% "mpg"]
features_plus <- paste(features, collapse = " + ")
my_formula <- paste0("mpg ~ ", features_plus)

# Grid to search; the outer name must match the uid given to the estimator
# stage in the pipeline below.
param_grid <- list(
  randForest = list(
    num_trees = c(200, 400, 1600)
  )
)

# Pipeline: R-formula feature assembly followed by a random forest regressor.
rf_pipeline <- ml_pipeline(sc) %>%
  ft_r_formula(my_formula) %>%
  ml_random_forest_regressor(
    feature_subset_strategy = "onethird",
    seed = sample(1:10000, 1),
    max_depth = 30,
    subsampling_rate = 0.90,
    uid = "randForest"
  )

rf_cv <- ml_cross_validator(
  sc,
  estimator = rf_pipeline,
  estimator_param_maps = param_grid,
  # mpg is continuous, so the folds are scored with a regression metric.
  # The previous multiclass "weightedPrecision" evaluator is meant for class
  # labels and gives no meaningful ranking of regression models.
  evaluator = ml_regression_evaluator(sc, metric_name = "rmse"),
  num_folds = 3
)

# Fit the cross-validator on the training partition and score the test one.
cv_model <- rf_cv %>% ml_fit(TRAIN)
TEST.scored <- ml_predict(cv_model, TEST)
TEST.scored
##############
# Bring the Spark test partition into local R memory; DALEX's explainer
# below is built from this local copy.
Test.local <- collect(TEST)
#################
# install.packages('DALEX')
# DALEX
#' Score a local data frame with a Spark ML model.
#'
#' Copies `new_data` into a temporary Spark table, runs `ml_predict()` on it,
#' and collects the `prediction` column back into R. Relies on the Spark
#' connection `sc` defined in the calling environment.
#'
#' @param model A fitted Spark ML model or pipeline accepted by `ml_predict()`.
#' @param new_data A local data frame to score.
#' @return A numeric vector built from the collected `prediction` column.
custom_predict <- function(model, new_data) {
  temp_name <- "spark_temp1b3c4e6"
  # overwrite = TRUE: a table left over from an earlier interrupted call must
  # not make this call fail on a name clash.
  new_data_spark <- copy_to(sc, new_data, name = temp_name, overwrite = TRUE)
  # Registered only after the copy succeeded, so the drop always has a table
  # to remove; runs even when scoring or collecting raises an error.
  on.exit(dplyr::db_drop_table(sc, temp_name), add = TRUE)
  spark_tbl_scored <- ml_predict(model, new_data_spark)
  scored_local <- as.data.frame(collect(select(spark_tbl_scored, prediction)))
  as.numeric(scored_local$prediction)
}
# Score the local test data through the wrapper defined above.
TEST.scored2 <- custom_predict(cv_model, Test.local)
library("DALEX")

## explain
# The explainer gets the predictors only (mpg removed) plus the observed mpg
# as the target, and uses custom_predict to obtain predictions from Spark.
explainer_spark_cv_rf <- explain(
  model = cv_model,
  data = select(Test.local, -mpg),
  y = Test.local[["mpg"]],
  predict_function = custom_predict,
  label = "spark cross-validated random forest"
)

## model performance
mp_spark_cv_rf <- model_performance(explainer_spark_cv_rf)
mp_spark_cv_rf
plot(mp_spark_cv_rf)
plot(mp_spark_cv_rf, geom = "boxplot")

## variable importance
vi_spark_cv_rf <- variable_importance(explainer_spark_cv_rf)
plot(vi_spark_cv_rf)

## prediction understanding
#install.packages('breakDown')
# Explain the prediction for the first row of the local test data.
sample_row <- Test.local[1, ]
pd_spark_cv_rf <- prediction_breakdown(
  explainer_spark_cv_rf,
  observation = sample_row
)
plot(pd_spark_cv_rf)
@maksymiuks what do you think about this? is it something for DALEXtra?
@koleckit agreed to work on this; @maksymiuks, he may contact you.
@pbiecek sounds great!
Has this been implemented? Are there any tutorials using DALEX with H2O and sparklyr?
Reopening the issue as I don't see sparklyr supported in DALEXtra nor a vignette with the use case at https://dalex.drwhy.ai
DALEX with H2O is available at https://htmlpreview.github.io/?https://github.com/ModelOriented/DALEX-docs/blob/master/vignettes/DALEX_h2o.html
If you want to add this feature, I will be happy to oversee merging the changes