This benchmark was run on a MacBook Pro 2020 with the following specs
Note that the differences between the parallel and sequential timings will increase for larger objects as the overhead for starting the parallel workers and collecting the results will decrease.
It is not fully clear why the parallel approach of the {terra} package is slow than its sequential counterpart but it might relate to the single-core performance of the machine the benchmark was run on in combination with the overhead associated with starting the parallel cluster the way it is done in the {terra} package.
library(mlr3)
library(mlr3spatial)
library(future)
library(bench)
library(stars)
#> Loading required package: abind
#> Loading required package: sf
#> Linking to GEOS 3.9.1, GDAL 3.3.1, PROJ 8.1.0
library(rpart)
# SpatRaster demo stack
demo_stack_spatraster(50)
stack_terra = data.table::data.table(ID = c(0, 1), y = c("negative", "positive"))
value =::setCats(stack_terra, layer = "y", value = value)
terra names(stack_terra)
colnames = tempfile("terra", fileext = ".tif")
file_terra =::writeRaster(stack_terra, file_terra)
terra
# RasterBrick demo stack
demo_stack_rasterbrick(50)
stack_raster = names(stack_raster)
colnames_raster = tempfile("raster", fileext = ".tif")
file_raster =::writeRaster(stack_raster, file_raster)
raster#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6
# tasks
terra::rast(file_terra)
stack_terra = DataBackendRaster$new(stack_terra)
backend_terra = as_task_regr(backend_terra, target = "x_1")
task_terra =
raster::brick(file_raster)
stack_raster =names(stack_raster) = colnames_raster
DataBackendRaster$new(stack_raster)
backend_raster =#> Warning in wkt(from): no wkt comment
as_task_regr(backend_raster, target = "x_1")
task_raster =
# Train learners
set.seed(42)
sample(1:task_terra$nrow, 50)
row_ids =
lrn("regr.rpart")
learner_task_terra =$parallel_predict = TRUE
learner_task_terra$train(task_terra, row_ids = row_ids)
learner_task_terra
lrn("regr.rpart")
learner_task_raster =$parallel_predict = TRUE
learner_task_terra$train(task_raster, row_ids = row_ids)
learner_task_raster
# non-mlr3 models
rpart::rpart(x_1 ~ ., task_terra$data(rows = row_ids))
rpart_task_terra = rpart::rpart(x_1 ~ ., task_raster$data(rows = row_ids)) rpart_task_raster =
bench::mark(
bm =
"01-mlr3-terra-4-cores" = {
plan(multicore, workers = 4)
predict_spatial(task_terra, learner_task_terra, chunksize = 2000L)
},
"02-terra-4-cores" = terra::predict(stack_terra, rpart_task_terra, cores = 4, cpkgs = "rpart"),
"03-mlr3-raster-4-cores" = {
plan(multicore, workers = 4)
predict_spatial(task_raster, learner_task_raster, chunksize = 2000L, format = "raster")
},
"04-raster-4-cores" = {
library(raster)
library(rpart)
beginCluster(4, type = "PSOCK")
clusterR(stack_raster, predict, args = list(model = rpart_task_raster))
},
check = FALSE, filter_gc = FALSE, min_iterations = 3,
max_iterations = 3, memory = FALSE)
#> Loading required package: sp
#>
#> Attaching package: 'raster'
#> The following object is masked from 'package:future':
#>
#> values
#> The following object is masked from 'package:mlr3':
#>
#> resample
$`itr/sec` = NULL
bm$result = NULL
bm$`gc/sec` = NULL
bm$memory = NULL
bm$mem_alloc = NULL
bm
print(bm)
#> # A tibble: 4 × 8
#> expression min median n_itr n_gc total_time time gc
#> <bch:expr> <bch:tm> <bch:tm> <int> <dbl> <bch:tm> <list> <list>
#> 1 01-mlr3-terra-4-cores 2.44s 2.49s 3 35 7.45s <bench_tm [3]> <tibble [3 × 3]>
#> 2 02-terra-4-cores 17.16s 17.22s 3 26 52.7s <bench_tm [3]> <tibble [3 × 3]>
#> 3 03-mlr3-raster-4-cores 3.25s 3.5s 3 34 10.42s <bench_tm [3]> <tibble [3 × 3]>
#> 4 04-raster-4-cores 8.3s 8.74s 3 1 25.94s <bench_tm [3]> <tibble [3 × 3]>
library(ggplot2)
autoplot(bm, type = "ridge")
#> Loading required namespace: tidyr
#> Picking joint bandwidth of 0.00863
~ 25 Mio. values
485 MB on disk
# SpatRaster demo stack
demo_stack_spatraster(500)
stack_terra = data.table::data.table(ID = c(0, 1), y = c("negative", "positive"))
value =::setCats(stack_terra, layer = "y", value = value)
terra names(stack_terra)
colnames = tempfile("terra", fileext = ".tif")
file_terra =::writeRaster(stack_terra, file_terra)
terra
# RasterBrick demo stack
demo_stack_rasterbrick(500)
stack_raster = names(stack_raster)
colnames_raster = tempfile("raster", fileext = ".tif")
file_raster =::writeRaster(stack_raster, file_raster)
raster#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6
# tasks
terra::rast(file_terra)
stack_terra = DataBackendRaster$new(stack_terra)
backend_terra = as_task_regr(backend_terra, target = "x_1")
task_terra =
raster::brick(file_raster)
stack_raster =names(stack_raster) = colnames_raster
DataBackendRaster$new(stack_raster)
backend_raster =#> Warning in wkt(from): no wkt comment
as_task_regr(backend_raster, target = "x_1")
task_raster =
# Train learners
set.seed(42)
sample(1:task_terra$nrow, 50)
row_ids =
lrn("regr.rpart")
learner_task_terra =$parallel_predict = TRUE
learner_task_terra$train(task_terra, row_ids = row_ids)
learner_task_terra
lrn("regr.rpart")
learner_task_raster =$parallel_predict = TRUE
learner_task_terra$train(task_raster, row_ids = row_ids)
learner_task_raster
# non-mlr3 models
rpart::rpart(x_1 ~ ., task_terra$data(rows = row_ids))
rpart_task_terra = rpart::rpart(x_1 ~ ., task_raster$data(rows = row_ids)) rpart_task_raster =
bench::mark(
bm =
"01-mlr3-terra-4-cores" = {
plan(multicore, workers = 4)
predict_spatial(task_terra, learner_task_terra, chunksize = 2000L)
},
"02-terra-4-cores" = terra::predict(stack_terra, rpart_task_terra, cores = 4, cpkgs = "rpart"),
"03-mlr3-raster-4-cores" = {
plan(multicore, workers = 4)
predict_spatial(task_raster, learner_task_raster, chunksize = 2000L, format = "raster")
},
"04-raster-4-cores" = {
library(raster)
library(rpart)
beginCluster(4, type = "PSOCK")
clusterR(stack_raster, predict, args = list(model = rpart_task_raster))
},
check = FALSE, filter_gc = FALSE, min_iterations = 3,
max_iterations = 3, memory = FALSE)
$`itr/sec` = NULL
bm$result = NULL
bm$`gc/sec` = NULL
bm$memory = NULL
bm$mem_alloc = NULL
bm
print(bm)
#> # A tibble: 4 × 8
#> expression min median n_itr n_gc total_time time gc
#> <bch:expr> <bch:tm> <bch:tm> <int> <dbl> <bch:tm> <list> <list>
#> 1 01-mlr3-terra-4-cores 20.88s 21.55s 3 170 1.11m <bench_tm [3]> <tibble [3 × 3]>
#> 2 02-terra-4-cores 1.44m 1.45m 3 27 4.37m <bench_tm [3]> <tibble [3 × 3]>
#> 3 03-mlr3-raster-4-cores 30.5s 30.86s 3 38 1.62m <bench_tm [3]> <tibble [3 × 3]>
#> 4 04-raster-4-cores 24.4s 28.1s 3 1 1.35m <bench_tm [3]> <tibble [3 × 3]>
library(ggplot2)
autoplot(bm, type = "ridge")
#> Picking joint bandwidth of 0.0143