Introduction to santoku

Introduction

Santoku is a package for cutting data into intervals. It provides a replacement for base R’s cut() function.

Installation

To install santoku, run:

install.packages("santoku")

Basic usage

Use chop() like cut() to cut your data up:

library(santoku)
x <- runif(10, 0, 10)
(chopped <- chop(x, breaks = 0:10))
#>  [1] [4, 5)  [8, 9)  [3, 4)  [4, 5)  [7, 8)  [9, 10) [6, 7)  [8, 9)  [1, 2) 
#> [10] [4, 5) 
#> Levels: [1, 2) [3, 4) [4, 5) [6, 7) [7, 8) [8, 9) [9, 10)
data.frame(x, chopped)
#>           x chopped
#> 1  4.978305  [4, 5)
#> 2  8.969989  [8, 9)
#> 3  3.391823  [3, 4)
#> 4  4.676785  [4, 5)
#> 5  7.057042  [7, 8)
#> 6  9.707687 [9, 10)
#> 7  6.713807  [6, 7)
#> 8  8.376589  [8, 9)
#> 9  1.086165  [1, 2)
#> 10 4.495479  [4, 5)

chop() returns a factor.

If your data extends beyond the limits of breaks, the breaks will be extended automatically to cover it:

chopped <- chop(x, breaks = 3:7)
data.frame(x, chopped)
#>           x    chopped
#> 1  4.978305     [4, 5)
#> 2  8.969989 [7, 9.708]
#> 3  3.391823     [3, 4)
#> 4  4.676785     [4, 5)
#> 5  7.057042 [7, 9.708]
#> 6  9.707687 [7, 9.708]
#> 7  6.713807     [6, 7)
#> 8  8.376589 [7, 9.708]
#> 9  1.086165 [1.086, 3)
#> 10 4.495479     [4, 5)

To chop a single number into a separate category, put the number twice in breaks:

x_fives <- x
x_fives[1:5] <- 5
chopped <- chop(x_fives, c(2, 5, 5, 8))
data.frame(x_fives, chopped)
#>     x_fives    chopped
#> 1  5.000000        {5}
#> 2  5.000000        {5}
#> 3  5.000000        {5}
#> 4  5.000000        {5}
#> 5  5.000000        {5}
#> 6  9.707687 [8, 9.708]
#> 7  6.713807     (5, 8)
#> 8  8.376589 [8, 9.708]
#> 9  1.086165 [1.086, 2)
#> 10 4.495479     [2, 5)

To quickly produce a table of chopped data, use tab():

tab(1:10, c(2, 5, 8))
#>  [1, 2)  [2, 5)  [5, 8) [8, 10] 
#>       1       3       3       3

More ways to chop

To chop into fixed-width intervals, starting at the minimum value, use chop_width():

chopped <- chop_width(x, 2)
data.frame(x, chopped)
#>           x        chopped
#> 1  4.978305 [3.086, 5.086)
#> 2  8.969989 [7.086, 9.086)
#> 3  3.391823 [3.086, 5.086)
#> 4  4.676785 [3.086, 5.086)
#> 5  7.057042 [5.086, 7.086)
#> 6  9.707687 [9.086, 11.09)
#> 7  6.713807 [5.086, 7.086)
#> 8  8.376589 [7.086, 9.086)
#> 9  1.086165 [1.086, 3.086)
#> 10 4.495479 [3.086, 5.086)

To chop into an exact number of fixed-width intervals, given by the intervals argument, use chop_evenly():

chopped <- chop_evenly(x, intervals = 3)
data.frame(x, chopped)
#>           x        chopped
#> 1  4.978305  [3.96, 6.834)
#> 2  8.969989 [6.834, 9.708]
#> 3  3.391823  [1.086, 3.96)
#> 4  4.676785  [3.96, 6.834)
#> 5  7.057042 [6.834, 9.708]
#> 6  9.707687 [6.834, 9.708]
#> 7  6.713807  [3.96, 6.834)
#> 8  8.376589 [6.834, 9.708]
#> 9  1.086165  [1.086, 3.96)
#> 10 4.495479  [3.96, 6.834)

To chop into groups with a fixed number of members, use chop_n():

chopped <- chop_n(x, 4)
table(chopped)
#> chopped
#> [1.086, 4.978)  [4.978, 8.97)  [8.97, 9.708] 
#>              4              4              2

To chop into a fixed number of equal-sized groups, use chop_equally():

chopped <- chop_equally(x, groups = 5)
table(chopped)
#> chopped
#> [1.086, 4.275) [4.275, 4.858) [4.858, 6.851) [6.851, 8.495) [8.495, 9.708] 
#>              2              2              2              2              2

To chop data up by quantiles, use chop_quantiles():

chopped <- chop_quantiles(x, c(0.25, 0.5, 0.75))
data.frame(x, chopped)
#>           x     chopped
#> 1  4.978305  [25%, 50%)
#> 2  8.969989 (75%, 100%]
#> 3  3.391823   [0%, 25%)
#> 4  4.676785  [25%, 50%)
#> 5  7.057042  [50%, 75%]
#> 6  9.707687 (75%, 100%]
#> 7  6.713807  [50%, 75%]
#> 8  8.376589 (75%, 100%]
#> 9  1.086165   [0%, 25%)
#> 10 4.495479   [0%, 25%)

To chop data by standard deviations around the mean, use chop_mean_sd():

chopped <- chop_mean_sd(x)
data.frame(x, chopped)
#>           x        chopped
#> 1  4.978305  [-1 sd, 0 sd)
#> 2  8.969989   [1 sd, 2 sd)
#> 3  3.391823  [-1 sd, 0 sd)
#> 4  4.676785  [-1 sd, 0 sd)
#> 5  7.057042   [0 sd, 1 sd)
#> 6  9.707687   [1 sd, 2 sd)
#> 7  6.713807   [0 sd, 1 sd)
#> 8  8.376589   [0 sd, 1 sd)
#> 9  1.086165 [-2 sd, -1 sd)
#> 10 4.495479  [-1 sd, 0 sd)

To chop data into attractive intervals, use chop_pretty(). This selects intervals which are a multiple of 2, 5 or 10. It’s useful for producing bar plots.

chopped <- chop_pretty(x)
data.frame(x, chopped)
#>           x chopped
#> 1  4.978305  [4, 6)
#> 2  8.969989 [8, 10)
#> 3  3.391823  [2, 4)
#> 4  4.676785  [4, 6)
#> 5  7.057042  [6, 8)
#> 6  9.707687 [8, 10)
#> 7  6.713807  [6, 8)
#> 8  8.376589 [8, 10)
#> 9  1.086165  [0, 2)
#> 10 4.495479  [4, 6)

tab_n(), tab_width(), and friends act similarly to tab(), calling the related chop_* function and then table() on the result.

tab_n(x, 4)
#> [1.086, 4.978)  [4.978, 8.97)  [8.97, 9.708] 
#>              4              4              2
tab_width(x, 2)
#> [1.086, 3.086) [3.086, 5.086) [5.086, 7.086) [7.086, 9.086) [9.086, 11.09) 
#>              1              4              2              2              1
tab_evenly(x, 5)
#>  [1.086, 2.81)  [2.81, 4.535) [4.535, 6.259) [6.259, 7.983) [7.983, 9.708] 
#>              1              2              2              2              3
tab_mean_sd(x)
#> [-2 sd, -1 sd)  [-1 sd, 0 sd)   [0 sd, 1 sd)   [1 sd, 2 sd) 
#>              1              4              3              2

You can chop dates too:

library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
y2k <- as.Date("2000-01-01") + 0:365
months <- chop_width(y2k, months(1))
table(months)
#> months
#> [2000-01-01, 2000-02-01) [2000-02-01, 2000-03-01) [2000-03-01, 2000-04-01) 
#>                       31                       29                       31 
#> [2000-04-01, 2000-05-01) [2000-05-01, 2000-06-01) [2000-06-01, 2000-07-01) 
#>                       30                       31                       30 
#> [2000-07-01, 2000-08-01) [2000-08-01, 2000-09-01) [2000-09-01, 2000-10-01) 
#>                       31                       31                       30 
#> [2000-10-01, 2000-11-01) [2000-11-01, 2000-12-01) [2000-12-01, 2001-01-01) 
#>                       31                       30                       31

Advanced usage

You can change factor labels with the labels argument:

chopped <- chop(x, c(2, 5, 8), labels = c("Lowest", "Low", "Higher", "Highest"))
data.frame(x, chopped)
#>           x chopped
#> 1  4.978305     Low
#> 2  8.969989 Highest
#> 3  3.391823     Low
#> 4  4.676785     Low
#> 5  7.057042  Higher
#> 6  9.707687 Highest
#> 7  6.713807  Higher
#> 8  8.376589 Highest
#> 9  1.086165  Lowest
#> 10 4.495479     Low

You need as many labels as there are intervals: one fewer than length(breaks) if your data doesn’t extend beyond breaks, and one more than length(breaks) if it extends beyond them at both ends (as in the example above).

To label intervals with a dash, use lbl_dash():

chopped <- chop(x, c(2, 5, 8), labels = lbl_dash())
data.frame(x, chopped)
#>           x chopped
#> 1  4.978305     2—5
#> 2  8.969989 8—9.708
#> 3  3.391823     2—5
#> 4  4.676785     2—5
#> 5  7.057042     5—8
#> 6  9.707687 8—9.708
#> 7  6.713807     5—8
#> 8  8.376589 8—9.708
#> 9  1.086165 1.086—2
#> 10 4.495479     2—5

To label integer data, use lbl_discrete(). It uses more informative right endpoints:

chopped  <- chop(1:10, c(2, 5, 8), labels = lbl_discrete())
chopped2 <- chop(1:10, c(2, 5, 8), labels = lbl_dash())
data.frame(x = 1:10, lbl_discrete = chopped, lbl_dash = chopped2)
#>     x lbl_discrete lbl_dash
#> 1   1            1      1—2
#> 2   2          2—4      2—5
#> 3   3          2—4      2—5
#> 4   4          2—4      2—5
#> 5   5          5—7      5—8
#> 6   6          5—7      5—8
#> 7   7          5—7      5—8
#> 8   8         8—10     8—10
#> 9   9         8—10     8—10
#> 10 10         8—10     8—10

You can customize the first or last labels:

chopped <- chop(x, c(2, 5, 8), labels = lbl_dash(first = "< 2", last = "8+"))
data.frame(x, chopped)
#>           x chopped
#> 1  4.978305     2—5
#> 2  8.969989      8+
#> 3  3.391823     2—5
#> 4  4.676785     2—5
#> 5  7.057042     5—8
#> 6  9.707687      8+
#> 7  6.713807     5—8
#> 8  8.376589      8+
#> 9  1.086165     < 2
#> 10 4.495479     2—5

To label intervals in order, use lbl_seq():

chopped <- chop(x, c(2, 5, 8), labels = lbl_seq())
data.frame(x, chopped)
#>           x chopped
#> 1  4.978305       b
#> 2  8.969989       d
#> 3  3.391823       b
#> 4  4.676785       b
#> 5  7.057042       c
#> 6  9.707687       d
#> 7  6.713807       c
#> 8  8.376589       d
#> 9  1.086165       a
#> 10 4.495479       b

You can use numerals or even roman numerals:

chop(x, c(2, 5, 8), labels = lbl_seq("(1)"))
#>  [1] (2) (4) (2) (2) (3) (4) (3) (4) (1) (2)
#> Levels: (1) (2) (3) (4)
chop(x, c(2, 5, 8), labels = lbl_seq("i."))
#>  [1] ii.  iv.  ii.  ii.  iii. iv.  iii. iv.  i.   ii. 
#> Levels: i. ii. iii. iv.

Other labelling functions are also available; see the package reference documentation for the full list of lbl_*() functions.

By default, chop() extends breaks if necessary. If you don’t want that, set extend = FALSE:

chopped <- chop(x, c(3, 5, 7), extend = FALSE)
data.frame(x, chopped)
#>           x chopped
#> 1  4.978305  [3, 5)
#> 2  8.969989    <NA>
#> 3  3.391823  [3, 5)
#> 4  4.676785  [3, 5)
#> 5  7.057042    <NA>
#> 6  9.707687    <NA>
#> 7  6.713807  [5, 7)
#> 8  8.376589    <NA>
#> 9  1.086165    <NA>
#> 10 4.495479  [3, 5)

Data outside the range of breaks will become NA.

By default, intervals are closed on the left, i.e. they include their left endpoints. If you want right-closed intervals, set left = FALSE:

y <- 1:5
data.frame(
        y = y, 
        left_closed = chop(y, 1:5), 
        right_closed = chop(y, 1:5, left = FALSE)
      )
#>   y left_closed right_closed
#> 1 1      [1, 2)          {1}
#> 2 2      [2, 3)       (1, 2]
#> 3 3      [3, 4)       (2, 3]
#> 4 4      [4, 5)       (3, 4]
#> 5 5         {5}       (4, 5]

If you want to close off the last interval, set close_end = TRUE:

data.frame(
  y = y,
  rightmost_open = chop(y, 1:5),
  rightmost_closed   = chop(y, 1:5, close_end = TRUE)
)
#>   y rightmost_open rightmost_closed
#> 1 1         [1, 2)           [1, 2)
#> 2 2         [2, 3)           [2, 3)
#> 3 3         [3, 4)           [3, 4)
#> 4 4         [4, 5)           [4, 5]
#> 5 5            {5}           [4, 5]