Getting Started with {tidysdm}

library(tidysdm)
#> Loading required package: parsnip
#> Loading required package: recipes
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
#> 
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#> 
#>     step
set.seed(112233)

tidysdm is an offshoot of the ENMTools package, and will eventually take over that package’s modelling functions. tidysdm is a package to make it easy to fit Species Distribution Models in a tidymodels framework. We will use data from the ENMTools package to demonstrate how tidysdm works.

library(ENMTools)
#> Loading required package: dismo
#> Loading required package: raster
#> Loading required package: sp
#> 
#> Attaching package: 'raster'
#> The following object is masked from 'package:dplyr':
#> 
#>     select
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ forcats   1.0.0     ✔ readr     2.1.5
#> ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
#> ✔ lubridate 1.9.4     ✔ tibble    3.2.1
#> ✔ purrr     1.0.2     ✔ tidyr     1.3.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ tidyr::extract() masks raster::extract()
#> ✖ dplyr::filter()  masks stats::filter()
#> ✖ stringr::fixed() masks recipes::fixed()
#> ✖ dplyr::lag()     masks stats::lag()
#> ✖ raster::select() masks dplyr::select()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
#> Registered S3 method overwritten by 'yardstick':
#>   method       from         
#>   print.metric spatstat.geom
#> ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
#> ✔ broom        1.0.7     ✔ tune         1.2.1
#> ✔ dials        1.3.0     ✔ workflows    1.1.4
#> ✔ infer        1.0.7     ✔ workflowsets 1.1.0
#> ✔ modeldata    1.4.0     ✔ yardstick    1.3.2
#> ✔ rsample      1.2.1     
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> ✖ scales::discard()  masks purrr::discard()
#> ✖ tidyr::extract()   masks raster::extract()
#> ✖ dplyr::filter()    masks stats::filter()
#> ✖ stringr::fixed()   masks recipes::fixed()
#> ✖ dplyr::lag()       masks stats::lag()
#> ✖ raster::select()   masks dplyr::select()
#> ✖ yardstick::spec()  masks readr::spec()
#> ✖ recipes::step()    masks stats::step()
#> ✖ dials::threshold() masks dismo::threshold()
#> ✖ raster::update()   masks recipes::update(), stats::update()
#> • Search for functions across packages at https://www.tidymodels.org/find/
library(spatialsample)
data("iberolacerta.clade")
#> Warning in data("iberolacerta.clade"): data set 'iberolacerta.clade' not found
data("euro.worldclim")
#> Warning in data("euro.worldclim"): data set 'euro.worldclim' not found
monticola <- iberolacerta.clade$species$monticola

We start by generating a {tidymodels} compatable sf object for the species we want to model.

dat <- sdm_data(monticola$presence.points,
                bg = monticola$range,
                n = 5000,
                coords = c("Longitude",
                           "Latitude"),
                crs = 4326)

dat
#> Simple feature collection with 5260 features and 2 fields
#> Geometry type: POINT
#> Dimension:     XY
#> Bounding box:  xmin: -9.326782 ymin: 39.83402 xmax: 0.6652058 ymax: 43.83249
#> Geodetic CRS:  BOUNDCRS[
#>     SOURCECRS[
#>         GEOGCRS["unknown",
#>             DATUM["World Geodetic System 1984",
#>                 ELLIPSOID["WGS 84",6378137,298.257223563,
#>                     LENGTHUNIT["metre",1]],
#>                 ID["EPSG",6326]],
#>             PRIMEM["Greenwich",0,
#>                 ANGLEUNIT["degree",0.0174532925199433],
#>                 ID["EPSG",8901]],
#>             CS[ellipsoidal,2],
#>                 AXIS["longitude",east,
#>                     ORDER[1],
#>                     ANGLEUNIT["degree",0.0174532925199433,
#>                         ID["EPSG",9122]]],
#>                 AXIS["latitude",north,
#>                     ORDER[2],
#>                     ANGLEUNIT["degree",0.0174532925199433,
#>                         ID["EPSG",9122]]]]],
#>     TARGETCRS[
#>         GEOGCRS["WGS 84",
#>             DATUM["World Geodetic System 1984",
#>                 ELLIPSOID["WGS 84",6378137,298.257223563,
#>                     LENGTHUNIT["metre",1]]],
#>             PRIMEM["Greenwich",0,
#>                 ANGLEUNIT["degree",0.0174532925199433]],
#>             CS[ellipsoidal,2],
#>                 AXIS["geodetic latitude (Lat)",north,
#>                     ORDER[1],
#>                     ANGLEUNIT["degree",0.0174532925199433]],
#>                 AXIS["geodetic longitude (Lon)",east,
#>                     ORDER[2],
#>                     ANGLEUNIT["degree",0.0174532925199433]],
#>             ID["EPSG",4326]]],
#>     ABRIDGEDTRANSFORMATION["Transformation from unknown to WGS84",
#>         METHOD["Geocentric translations (geog2D domain)",
#>             ID["EPSG",9603]],
#>         PARAMETER["X-axis translation",0,
#>             ID["EPSG",8605]],
#>         PARAMETER["Y-axis translation",0,
#>             ID["EPSG",8606]],
#>         PARAMETER["Z-axis translation",0,
#>             ID["EPSG",8607]]]]
#> First 10 features:
#>    present pnt_origin                       pnts
#> 1  present       data POINT (-5.171215 43.06957)
#> 2  present       data POINT (-6.036635 43.02531)
#> 3  present       data POINT (-7.679727 40.38852)
#> 4  present       data POINT (-7.790437 40.30959)
#> 5  present       data  POINT (-7.47334 43.78935)
#> 6  present       data  POINT (-6.575039 42.9107)
#> 7  present       data POINT (-5.132756 43.49572)
#> 8  present       data POINT (-7.787378 40.39362)
#> 9  present       data  POINT (-4.941888 43.3531)
#> 10 present       data  POINT (-7.621731 40.3417)

plot(dat %>% 
       dplyr::select(present) %>% 
       arrange(present), 
     pch = 19)

Now we can use the {spatialsample} package to create spatial cross validation folds! We will first create a regular cross validation fold object for comparison. Note that this can take awhile because a spatial distance is calculated between all points!

## regular CV
cv_folds <- vfold_cv(dat, 9)

## spatial CV
cv_folds_spat <- spatial_block_cv(dat, method = "snake",
                             n = c(6, 4),
                             v = 9,
                             buffer = 50000)
cv_folds_spat
#> #  9-fold spatial block cross-validation 
#> # A tibble: 9 × 2
#>   splits              id   
#>   <list>              <chr>
#> 1 <split [3965/453]>  Fold1
#> 2 <split [4018/448]>  Fold2
#> 3 <split [3879/807]>  Fold3
#> 4 <split [4221/512]>  Fold4
#> 5 <split [4465/334]>  Fold5
#> 6 <split [4365/119]>  Fold6
#> 7 <split [3557/649]>  Fold7
#> 8 <split [3367/844]>  Fold8
#> 9 <split [3193/1093]> Fold9


## look at the spatial folds
autoplot(cv_folds_spat)


autoplot(cv_folds_spat$splits[[2]])

autoplot(cv_folds_spat$splits[[8]])

A faster spatial cross validation specifically designed for presence-only data is included in {tidysdm}. It is faster because it first divides the space up into a grid, and then uses the grid cells in the cross validation instead of individual points. All point in each grid cell get ‘dragged along’ with the cell as it gets shuffled into the analysis or assessment set. This way the function does not have to calculate pairwise distance for all points, just for the grid cells. It also reduces the chances of getting analysis sets with only presences or only absences. Let’s try it now.

## presence only (po) spatial CV
cv_folds_spat <- po_spatial_buffer_vfold_cv(dat, presence = "present", n = c(24, 16),
                                             v = 9)

## look at the spatial folds
autoplot(cv_folds_spat)


autoplot(cv_folds_spat$splits[[2]])

autoplot(cv_folds_spat$splits[[8]])

Let’s create a recipe to apply some common data processing steps to prepare for SDM fitting. We start with step_add_env_vars(), which is a {tidymodels} function. The rest are standard steps from the {recipes} package. step_impute_median() imputes missing values (since we will be doing a random forest model which cannot handle missing values). step_YeoJohnson() does a Yeo-Johnson transformation on the predictors, which makes them more symmetric and ‘Gaussian-like’. It also saves the parameters used in the transformation which will be automatically applied to any test data used for prediction later. Finally step_normalize() transforms predictors to have a mean of zero and a standard deviation of 1, so that they are all on the same scale. It also saves the means and sds to be applied to test data. To see the result of the step we run prep() and bake(new_data = NULL).

sdm_recipe <- recipe(dat) %>%
  step_add_env_vars(env = euro.worldclim) %>%
  step_impute_median(all_predictors()) %>%
  step_YeoJohnson(all_predictors()) %>%
  step_normalize(all_predictors())

test <- prep(sdm_recipe) %>%
  bake(NULL)

test
#> # A tibble: 5,260 × 22
#>    present                 pnts pnt_origin   bio1    bio2   bio3     bio4
#>    <fct>            <POINT [°]> <fct>       <dbl>   <dbl>  <dbl>    <dbl>
#>  1 present (-5.171215 43.06957) data       -1.49   0.143   0.769 -0.342  
#>  2 present (-6.036635 43.02531) data       -1.55   0.143   0.769 -0.347  
#>  3 present (-7.679727 40.38852) data        1.06  -0.356  -0.229 -0.0537 
#>  4 present (-7.790437 40.30959) data        0.637 -0.786  -1.21  -0.0366 
#>  5 present  (-7.47334 43.78935) data        1.23  -1.46    1.79  -1.60   
#>  6 present  (-6.575039 42.9107) data       -1.29   0.0548  0.769 -0.258  
#>  7 present (-5.132756 43.49572) data        0.847 -1.36    1.28  -1.33   
#>  8 present (-7.787378 40.39362) data        1.06  -0.356  -0.229 -0.0537 
#>  9 present  (-4.941888 43.3531) data        0.586 -1.36    0.769 -1.27   
#> 10 present  (-7.621731 40.3417) data       -0.655 -1.46   -2.64  -0.00690
#> # ℹ 5,250 more rows
#> # ℹ 15 more variables: bio5 <dbl>, bio6 <dbl>, bio7 <dbl>, bio8 <dbl>,
#> #   bio9 <dbl>, bio10 <dbl>, bio11 <dbl>, bio12 <dbl>, bio13 <dbl>,
#> #   bio14 <dbl>, bio15 <dbl>, bio16 <dbl>, bio17 <dbl>, bio18 <dbl>,
#> #   bio19 <dbl>

Now we can setup a random forest model with {parsnip} and combine it with our recipe using a workflow from the {workflows} package..

mod <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

wf <- workflow() %>%
  add_recipe(sdm_recipe) %>%
  add_model(mod,
            formula = present ~ .)

wf
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 4 Recipe Steps
#> 
#> • step_add_env_vars()
#> • step_impute_median()
#> • step_YeoJohnson()
#> • step_normalize()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Random Forest Model Specification (classification)
#> 
#> Engine-Specific Arguments:
#>   importance = impurity
#> 
#> Computational engine: ranger

Now we fit the model! We will start with the regular cross validation folds, and use fit_resamples() to generate metrics based on the fits to each of the validation sets in our folds. Then collect_metrics() will calculate the means across folds for us.

fit_1 <- wf %>%
  fit_resamples(cv_folds, 
                control = control_resamples(extract = extract_fit_engine))

fit_1 %>%
  collect_metrics()
#> # A tibble: 3 × 6
#>   .metric     .estimator   mean     n std_err .config             
#>   <chr>       <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 accuracy    binary     0.950      9 0.00343 Preprocessor1_Model1
#> 2 brier_class binary     0.0446     9 0.00246 Preprocessor1_Model1
#> 3 roc_auc     binary     0.833      9 0.00870 Preprocessor1_Model1

Okay, so our ROC AUC value is pretty good at around 0.82. Now we do the same for the spatial cross validation folds.

fit_2 <- wf %>%
  fit_resamples(cv_folds_spat,
                control = control_resamples(extract = extract_fit_engine))

fit_2 %>%
  collect_metrics()
#> # A tibble: 3 × 6
#>   .metric     .estimator   mean     n std_err .config             
#>   <chr>       <chr>       <dbl> <int>   <dbl> <chr>               
#> 1 accuracy    binary     0.951      9 0.00512 Preprocessor1_Model1
#> 2 brier_class binary     0.0463     9 0.00424 Preprocessor1_Model1
#> 3 roc_auc     binary     0.713      9 0.0169  Preprocessor1_Model1

Using spatially independent cross validation folds has shown us our model does much more poorly if we ask it to generalise to spatial areas not in the training set. Now ROC AUC is only ~ 0.72 – considerably worse. Looking at the individual folds, there is substantial variation in the quality of models.

fit_2$.metrics
#> [[1]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.962  Preprocessor1_Model1
#> 2 roc_auc     binary        0.807  Preprocessor1_Model1
#> 3 brier_class binary        0.0350 Preprocessor1_Model1
#> 
#> [[2]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.959  Preprocessor1_Model1
#> 2 roc_auc     binary        0.706  Preprocessor1_Model1
#> 3 brier_class binary        0.0404 Preprocessor1_Model1
#> 
#> [[3]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.958  Preprocessor1_Model1
#> 2 roc_auc     binary        0.686  Preprocessor1_Model1
#> 3 brier_class binary        0.0415 Preprocessor1_Model1
#> 
#> [[4]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.964  Preprocessor1_Model1
#> 2 roc_auc     binary        0.647  Preprocessor1_Model1
#> 3 brier_class binary        0.0367 Preprocessor1_Model1
#> 
#> [[5]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.942  Preprocessor1_Model1
#> 2 roc_auc     binary        0.769  Preprocessor1_Model1
#> 3 brier_class binary        0.0532 Preprocessor1_Model1
#> 
#> [[6]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.946  Preprocessor1_Model1
#> 2 roc_auc     binary        0.722  Preprocessor1_Model1
#> 3 brier_class binary        0.0502 Preprocessor1_Model1
#> 
#> [[7]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.962  Preprocessor1_Model1
#> 2 roc_auc     binary        0.719  Preprocessor1_Model1
#> 3 brier_class binary        0.0376 Preprocessor1_Model1
#> 
#> [[8]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.916  Preprocessor1_Model1
#> 2 roc_auc     binary        0.706  Preprocessor1_Model1
#> 3 brier_class binary        0.0759 Preprocessor1_Model1
#> 
#> [[9]]
#> # A tibble: 3 × 4
#>   .metric     .estimator .estimate .config             
#>   <chr>       <chr>          <dbl> <chr>               
#> 1 accuracy    binary        0.949  Preprocessor1_Model1
#> 2 roc_auc     binary        0.656  Preprocessor1_Model1
#> 3 brier_class binary        0.0461 Preprocessor1_Model1

In the eighth fold the model looks to have done a reasonable job. Which one was that?

autoplot(cv_folds_spat$splits[[8]])

Let’s have a look at the importance values determined by the random forest for our variables.

library(vip)
#> 
#> Attaching package: 'vip'
#> The following object is masked from 'package:utils':
#> 
#>     vi
library(patchwork)
#> 
#> Attaching package: 'patchwork'
#> The following object is masked from 'package:raster':
#> 
#>     area

fit_1 %>%
  unnest(.extracts) %>%
  pull(.extracts) %>%
  map(vip) %>%
  wrap_plots(ncol = 3, nrow = 3)

The ordering is reasonably consistent between different folds. Now, the spatial folds:

fit_2 %>%
  unnest(.extracts) %>%
  pull(.extracts) %>%
  map(vip) %>%
  wrap_plots(ncol = 3, nrow = 3)

There seems to be quite a bit more variation in what variables are important between different spatial folds. Which is interesting.

We can try and improve the performance of the model on spatially independent data by using the spatial cross validation folds to tune the hyperparameters of the model. We can use the {tune} package for this. Using tune_bayes() we can use Bayesian optimization to find an optimal set. What hyperparameters should we tune. For random forest we only have three, mtry, trees, and min_n. Let’s try tuning all three. First we make a new model object where we designate these parameters for tuning, then wrap it into a new workflow.

mod_tune <- rand_forest(mtry = tune(),
                        trees = tune(),
                        min_n = tune()) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

wf_tune <- workflow() %>%
  add_recipe(sdm_recipe) %>%
  add_model(mod_tune,
            formula = present ~ .)

wf_tune
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 4 Recipe Steps
#> 
#> • step_add_env_vars()
#> • step_impute_median()
#> • step_YeoJohnson()
#> • step_normalize()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Random Forest Model Specification (classification)
#> 
#> Main Arguments:
#>   mtry = tune()
#>   trees = tune()
#>   min_n = tune()
#> 
#> Engine-Specific Arguments:
#>   importance = impurity
#> 
#> Computational engine: ranger

Tuning is now as simple as calling tune_bayes(). First we set up an initial set of tuning models using a tuning grid (regularly spaced values of the hyper-parameters). By the way, this will take awhile. If you want to do this on your computer I would recommend setting up a parallel backend for tuning (see https://tune.tidymodels.org/articles/extras/optimizations.html).

tune_init <- wf_tune %>%
  tune_grid(cv_folds_spat,
            grid = 27,
            control = control_grid(verbose = interactive()))
#> i Creating pre-processing data to finalize unknown parameter: mtry

Now this serves as initial values for tune_bayes().

final_params <- extract_parameter_set_dials(mod_tune) %>%
  finalize(dat)

tuned <- wf_tune %>%
  tune_bayes(cv_folds_spat,
             initial = tune_init,
             iter = 100,
             param_info = final_params,
             control = control_bayes(verbose = interactive(),
                                     no_improve = 50L))

tuned %>%
  show_best()
#> Warning in show_best(.): No value of `metric` was given; "roc_auc" will be
#> used.
#> # A tibble: 5 × 10
#>    mtry trees min_n .metric .estimator  mean     n std_err .config .iter
#>   <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>   <int>
#> 1     1  1073    38 roc_auc binary     0.750     9  0.0178 Iter66     66
#> 2     1  1258    37 roc_auc binary     0.749     9  0.0188 Iter55     55
#> 3     1   805    39 roc_auc binary     0.749     9  0.0210 Iter97     97
#> 4     1   661    36 roc_auc binary     0.746     9  0.0185 Iter62     62
#> 5     1   980    39 roc_auc binary     0.746     9  0.0198 Iter50     50

After all that we haven’t much improved the ability of our model to predict spatially separated testing data sets! This is not that surprising since random forest generally doesn’t need much tuning. And ultimately, the problem is not poor hyper-parameters but overfitting to spatial patterns found in the data. This cannot be prevented except by finding some way to help the model ‘account’ for spatial autocorrelation in the data. There are some approaches to doing this, which might help, but can only go so far. The problem of space is impossible to make go away completely, at least using statistical methods.

Note that the best solutions had a very low mtry. This means the random forest is only using 1 to 3 variables at a time to make predictions. This implies all variable contain mostly the same amount of information and don’t interact much, suggesting that these climate predictors have little to do with the species distribution, each is mainly being used for their spatial information. The best bet here, as in most cases is to find better predictors that are more definitely related to the species’ known ecology!

Lastly, {tidysdm} makes it easy to generate visualizable predictions on the original landscape using the create_prediction_grid() function, which creates a grid of x and y values, optionally with a polygon attached to help with plotting. Feeding this to the augment function will automatically make predictions based on the model and the bind those prediction back to the prediction grid data, ready for plotting. We will make hexagons which look much cooler than squares (just use square = FALSE).

pred_grid <- create_prediction_grid(bg = monticola$range, n = 2500, square = FALSE, include_polygons = TRUE)
#> although coordinates are longitude/latitude, st_sample assumes that they are
#> planar
plot(pred_grid$polygon)

Now to make the predictions we first do a final fit of our random forest model on the full data set, and using the best hyper-parameters from our spatial cross validation.

final_fit <- wf_tune %>%
  finalize_workflow(select_best(tuned)) %>%
  fit(dat)
#> Warning in select_best(tuned): No value of `metric` was given; "roc_auc" will
#> be used.
final_fit
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 4 Recipe Steps
#> 
#> • step_add_env_vars()
#> • step_impute_median()
#> • step_YeoJohnson()
#> • step_normalize()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Ranger result
#> 
#> Call:
#>  ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~1L,      x), num.trees = ~1073L, min.node.size = min_rows(~38L, x),      importance = ~"impurity", num.threads = 1, verbose = FALSE,      seed = sample.int(10^5, 1), probability = TRUE) 
#> 
#> Type:                             Probability estimation 
#> Number of trees:                  1073 
#> Sample size:                      5260 
#> Number of independent variables:  19 
#> Mtry:                             1 
#> Target node size:                 38 
#> Variable importance mode:         impurity 
#> Splitrule:                        gini 
#> OOB prediction error (Brier s.):  0.04287625

## Does not work yet
pred_grid_preds <- final_fit %>%
  augment(pred_grid)