Packages

library(holodeck)
library(dplyr)
library(purrr)
library(mice)

Simulating data with `sim_*()`

holodeck provides functions to simulate different kinds of data as columns in a tibble:

sim_cat() for categorical variables
sim_covar() for multivariate normal numeric data
sim_discr() for multivariate normal data with different means for levels of some grouping variable
sim_missing() for randomly introducing NAs

To simulate multivariate data you need to start with a dataframe or a tibble. Once you have a dataframe or tibble, the sim_*() functions add columns onto it.

df <- tibble(Y = rep(c("a", "b"), each = 5))

df %>% sim_covar(n_vars = 5, var = 1, cov = 0.5)
#> # A tibble: 10 x 6
#>    Y         V1      V2     V3        V4       V5
#>    <chr>  <dbl>   <dbl>  <dbl>     <dbl>    <dbl>
#>  1 a      0.127 -0.663  -0.968  0.478    -0.571  
#>  2 a      0.286  0.563  -1.31   0.230     0.551  
#>  3 a      0.793 -0.385  -0.973  0.157    -0.830  
#>  4 a      0.642 -0.517   0.847 -0.940    -0.866  
#>  5 a      1.34   0.822   1.00   1.05      0.494  
#>  6 b      1.80  -0.0702  0.696  0.722     0.703  
#>  7 b      1.27   1.50   -0.349  2.63      0.0745 
#>  8 b     -0.593 -0.117  -1.06  -0.578    -0.00294
#>  9 b      0.767 -0.764   0.305 -0.501    -0.431  
#> 10 b      0.102 -0.806  -1.30  -0.000327 -1.31

Optionally you can create a tibble with the sim_covar() or sim_cat() functions by providing them with the N argument instead of .data.

sim_covar(n_obs = 10, n_vars = 5, var = 1, cov = 0.5)
#> # A tibble: 10 x 5
#>        V1     V2     V3      V4      V5
#>     <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
#>  1  2.25   1.15   1.75   1.47    1.82  
#>  2  1.41   0.390  1.81   0.623  -0.378 
#>  3  0.260  1.51  -0.408 -0.432   2.04  
#>  4 -0.693 -0.461  0.386  0.572   0.0558
#>  5 -1.37   1.36  -0.199  0.648  -0.289 
#>  6  0.635  0.453  0.616  0.0647  0.761 
#>  7 -0.569 -0.215  1.18  -0.215  -0.186 
#>  8 -0.637 -0.208 -1.58  -0.325   0.0663
#>  9 -0.138 -1.01  -0.967  0.925  -0.684 
#> 10  0.493  0.731 -0.106  0.599   0.373

sim_cat() is a rather simple wrapper that just creates a column of categorical data. Eventually, it will be expanded to allow creation of crossed and nested factors.

sim_cat(n_obs = 10, n_groups = 2)
#> # A tibble: 10 x 1
#>    group
#>    <chr>
#>  1 a    
#>  2 a    
#>  3 a    
#>  4 a    
#>  5 a    
#>  6 b    
#>  7 b    
#>  8 b    
#>  9 b    
#> 10 b

sim_discr() simulates covarying data that differs in means between levels of some grouping variable.

df %>%
  group_by(Y) %>% 
  sim_discr(n_vars = 5, var = 1, cov = 0.1, group_means = c(1, -1))
#> # A tibble: 10 x 6
#> # Groups:   Y [2]
#>    Y         V1     V2     V3     V4      V5
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
#>  1 a     -0.250  0.742  0.692 -0.336  1.30  
#>  2 a      0.621  0.791  1.68   1.41   0.649 
#>  3 a      1.58   0.551 -0.556  1.19   1.09  
#>  4 a      1.48  -0.911  0.916  1.15   0.223 
#>  5 a      0.266  1.10   1.14  -0.312 -0.0590
#>  6 b     -0.412 -2.18  -1.91  -0.380  0.290 
#>  7 b     -0.631  0.369 -0.852 -2.12   0.116 
#>  8 b      0.574 -0.332 -2.84  -0.126 -0.917 
#>  9 b     -2.03  -0.394 -0.642 -0.737 -0.628 
#> 10 b      0.834 -1.57  -0.854 -1.13   1.07

Chaining functions with `%>%`

One advantage of the holodeck package is the ability to chain functions together to create complex data covariance structures. You can chain functions together in any order, although the sim_discr() function requires a grouping variable.

All of the sim_* functions (besides sim_missing()) take an optional name argument which names the variables created.

df <-
  sim_covar(n_obs = 20, n_vars = 5, var = 1, cov = 0.1, name = "low") %>% #5 variables with low covariance
  sim_covar(n_vars = 5, var = 1, cov = 0.8, name = "high")            #5 variables with high covariance

Now we could add a categorical variable, and some variables that discriminate between levels of our categorical variable

df1 <-
  df %>% 
  sim_cat(n_groups = 2, name = "factor") %>% 
  group_by(factor) %>% 
  sim_discr(n_vars = 5, var = 1, cov = 0.1, group_means = c(-1, 1), name = "discr") %>% 
  ungroup()

Finally, if you want to simulate missing values, you can use sim_missing() to randomly introduce NAs.

df2 <-
  df1 %>% 
  sim_missing(prop = 0.1)
df2
#> # A tibble: 20 x 16
#>    factor   low_1   low_2  low_3   low_4   low_5  high_1  high_2   high_3
#>    <chr>    <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>    <dbl>
#>  1 a      -0.547  -0.143   1.46  NA      -0.496  NA       0.605   4.07e-1
#>  2 a      -0.937   0.0577 -0.588 -0.0707 -1.16   -0.131  -0.359  -5.35e-1
#>  3 a      -2.27   -0.660  -1.31  -2.54   -0.503  -0.583  -1.75   -5.92e-1
#>  4 a       0.0681 -0.491  -1.90  NA      -0.450  -1.28   -1.54   -1.71e+0
#>  5 a      -2.14   -1.07   -3.12  -0.591  -1.11   -0.539  NA      -2.07e+0
#>  6 a      -0.429   0.394   0.986  1.42    1.23    1.52    1.41    9.85e-4
#>  7 a       0.135   0.666   1.90   0.982  -1.01    0.357   0.259  -4.27e-2
#>  8 a       1.47   -0.245  NA     -0.736  -0.0165  0.0763  0.614   5.37e-1
#>  9 a       0.893   1.88   -0.673  0.210   0.232  -0.521  -0.260  NA      
#> 10 a       0.507   0.600   0.150  1.19   -0.0573  0.278   0.163   4.92e-1
#> 11 b      -0.486   0.789  -0.965  0.754  -0.0955 -0.190  -0.951  -1.08e-1
#> 12 b       0.719  -1.43   -1.46  -0.915  -0.509  -0.0249 -0.514  -8.81e-1
#> 13 b      -0.221  -0.308   0.144  0.775  -1.37   -0.165   0.0132  7.57e-2
#> 14 b      -1.96    0.538   0.288  0.541  -2.35    0.242  NA      -7.20e-1
#> 15 b      -0.686  -0.909  NA      0.167  NA       0.673   1.67   NA      
#> 16 b      -0.768   0.888   0.949 -1.03   -0.958  NA      NA       4.88e-1
#> 17 b      -1.25   -1.22   NA     -1.93   NA       0.133  -0.545  -1.47e-1
#> 18 b      -0.621   0.739   1.01   0.411  -1.01    1.13    0.376   5.92e-1
#> 19 b       0.331   0.588   1.80   1.47    1.39   -0.290  -0.973  -2.88e-1
#> 20 b      NA      -0.713   0.153  1.04   -0.433  -1.12   -1.01   -1.54e-1
#> # … with 7 more variables: high_4 <dbl>, high_5 <dbl>, discr_1 <dbl>,
#> #   discr_2 <dbl>, discr_3 <dbl>, discr_4 <dbl>, discr_5 <dbl>

Visualizing covariance

cov() creates a covariance matrix with variance on the diagonal. We can visualize it as a heatmap.

library(ggplot2)
df1 %>%
  select(-factor) %>% 
  cov() %>% 
  heatmap(Rowv = NA, Colv = NA, symm = TRUE)

Values are higher for the discriminating variables because the cov and var arguments to sim_discr() only control the covariance and variance within groups.

Example: Effect of missing data

One reason to simulate multivariate data is to test the effects of different properties of datasets on analysis results. For example, what’s the effect of missing data on a statistical analysis? The sim_missing() function replaces a proportion of values with NA. Let’s see how it affects a PLS-DA analysis.

Create datasets

We can chain several sim_* functions to quickly create a dataframe.

df2 <- 
  sim_cat(n_obs = 40, n_groups = 3, name = "factor") %>% 
  sim_covar(n_vars = 3, var = 1, cov = 0.0, name = "noise") %>% 
  group_by(factor) %>% 
  sim_discr(n_vars = 5, var = 1, cov = 0, group_means = c(-1, 0, 1), name  = "signal") %>%
  sim_discr(n_vars = 5, var = 1, cov = 0, group_means = c(0, 0.5, 1), name = "signal2") %>% 
  ungroup()
df2
#> # A tibble: 40 x 14
#>    factor noise_1 noise_2 noise_3 signal_1 signal_2 signal_3 signal_4 signal_5
#>    <chr>    <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
#>  1 a        1.55   0.0639  -1.29    -0.225   -0.101  -0.754    -0.255   -1.81 
#>  2 a        0.945  0.144    0.352   -2.14    -1.66   -1.95     -2.60    -1.15 
#>  3 a        0.400 -0.890   -0.411   -0.823    0.318  -0.189    -3.33    -2.16 
#>  4 a       -0.205  1.04    -1.00    -0.314   -1.97   -1.02     -0.842   -0.796
#>  5 a        0.844  0.179    0.336    0.670   -1.37   -1.06     -2.63    -1.61 
#>  6 a       -0.628 -0.270   -1.06    -1.19    -0.505  -0.730    -0.741   -0.338
#>  7 a       -0.569 -0.450    0.855   -2.38    -1.49   -1.50      1.85    -1.11 
#>  8 a       -0.662  1.73    -0.635   -1.03     0.345   0.328    -2.10    -1.81 
#>  9 a       -1.23  -0.231   -0.979    0.335   -0.863  -0.0528   -0.815   -0.517
#> 10 a        1.06   0.693    0.857   -1.56    -0.948  -1.38      0.625   -0.487
#> # … with 30 more rows, and 5 more variables: signal2_1 <dbl>, signal2_2 <dbl>,
#> #   signal2_3 <dbl>, signal2_4 <dbl>, signal2_5 <dbl>

We can then use map() from the purrr package to create many randomly generated datasets using the same specifications, with and without missing values.

set.seed(100)
dfs <-
  map(1:20, 
      ~sim_cat(n_obs = 40, n_groups = 3, name = "factor") %>% 
        sim_covar(n_vars = 3, var = 1, cov = 0.0, name = "noise") %>% 
        group_by(factor) %>% 
        sim_discr(n_vars = 5, var = 1, cov = 0, group_means = c(-1, 0, 1), name  = "signal") %>%
        sim_discr(n_vars = 5, var = 1, cov = 0, group_means = c(0, 0.5, 1), name = "signal2") %>% 
        ungroup())

Alternatively, you could generate one large dataframe (many rows) and take subsets. Either way, you know the “true” properties of the data and can compare to the results of the analyses you test.

Simulate missing data

We can now map the sim_missing() function to randomly introduce NAs to the datasets.

set.seed(101)
dfs.missing <-
  map(dfs, ~sim_missing(., prop = 0.05))

And finally, deal with those NAs with multiple imputation with the mice package.

# this might take a few seconds
dfs.imputed <-
  map(dfs.missing, ~mice(., printFlag = FALSE) %>% complete())

Here, we can compare an example dataset as original, with NAs, and imputed:

head(dfs[[1]])
#> # A tibble: 6 x 14
#>   factor noise_1 noise_2 noise_3 signal_1 signal_2 signal_3 signal_4 signal_5
#>   <chr>    <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
#> 1 a       -1.69    0.236  -1.09  -2.82     -0.546    -0.998   -1.87    0.0570
#> 2 a        0.166   0.307  -0.968 -2.90     -1.90     -0.680   -0.184  -1.20  
#> 3 a       -0.230   0.744   0.949 -0.435     1.21     -2.46    -1.69    0.830 
#> 4 a       -1.55   -0.111   2.62   0.505     0.0921   -1.86    -0.734   0.207 
#> 5 a        0.639   0.801  -2.89   0.00805  -2.24     -1.37    -1.79   -0.832 
#> 6 a        0.997   0.446  -1.85  -0.525    -0.266    -1.40    -0.841  -0.483 
#> # … with 5 more variables: signal2_1 <dbl>, signal2_2 <dbl>, signal2_3 <dbl>,
#> #   signal2_4 <dbl>, signal2_5 <dbl>
head(dfs.missing[[1]])
#> # A tibble: 6 x 14
#>   factor noise_1 noise_2 noise_3 signal_1 signal_2 signal_3 signal_4 signal_5
#>   <chr>    <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
#> 1 a       -1.69    0.236  NA     -2.82      -0.546   -0.998   -1.87    0.0570
#> 2 a        0.166   0.307  -0.968 NA         -1.90    -0.680   -0.184  -1.20  
#> 3 a       NA       0.744   0.949 -0.435      1.21    -2.46    -1.69    0.830 
#> 4 a       -1.55   -0.111   2.62   0.505     NA       -1.86    -0.734   0.207 
#> 5 a        0.639   0.801  -2.89   0.00805   -2.24    -1.37    -1.79   -0.832 
#> 6 a        0.997   0.446  -1.85  -0.525     -0.266   -1.40    -0.841  -0.483 
#> # … with 5 more variables: signal2_1 <dbl>, signal2_2 <dbl>, signal2_3 <dbl>,
#> #   signal2_4 <dbl>, signal2_5 <dbl>
head(dfs.imputed[[1]])
#>   factor    noise_1    noise_2    noise_3     signal_1   signal_2   signal_3
#> 1      a -1.6906116  0.2355350 -0.1069054 -2.817674585 -0.5463934 -0.9981282
#> 2      a  0.1657635  0.3070560 -0.9680104 -2.817674585 -1.8966699 -0.6795347
#> 3      a -0.5654266  0.7444634  0.9490844 -0.434555093  1.2120953 -2.4590652
#> 4      a -1.5515005 -0.1112986  2.6246159  0.504865901  0.3794100 -1.8565607
#> 5      a  0.6387119  0.8013671 -2.8872108  0.008051957 -2.2352913 -1.3670493
#> 6      a  0.9972444  0.4459734 -1.8517622 -0.525304431 -0.2656965 -1.4024310
#>     signal_4    signal_5  signal2_1  signal2_2  signal2_3  signal2_4
#> 1 -1.8734533  0.05695263 -1.0783810  0.6778122  0.3422852 -1.4571034
#> 2 -0.1839402 -1.20440247 -0.6043582 -0.5410977  0.2447505 -2.1684150
#> 3 -1.6913561  0.83017336  2.4216585  1.5520072 -1.2578215  0.2991252
#> 4 -0.7342102  0.20676473  0.6113694  0.8093880 -0.1740582  0.5140016
#> 5 -1.7852787 -0.83178566  0.5550258  1.5885649 -0.6389789 -2.1684150
#> 6 -0.8410057 -0.48252823  0.4278599 -1.1773614  1.0966490 -1.2507898
#>     signal2_5
#> 1 -0.04717609
#> 2 -0.40846248
#> 3 -0.10908363
#> 4  0.97574647
#> 5  0.24133602
#> 6  0.12009805

Simulating Multivariate Data with `holodeck`

Eric R. Scott

2020-05-25

Packages

Simulating data with `sim_*()`

Chaining functions with `%>%`

Visualizing covariance

Example: Effect of missing data

Create datasets

Simulate missing data

Simulating Multivariate Data with holodeck

Eric R. Scott

2020-05-25

Packages

Simulating data with sim_*()

Chaining functions with %>%

Visualizing covariance

Example: Effect of missing data

Create datasets

Simulate missing data

Simulating Multivariate Data with `holodeck`

Simulating data with `sim_*()`

Chaining functions with `%>%`