Comparing regex functions for data frames

In this vignette I show comparisons between namedCapture::df_match_variable and its closest cousin in the R package universe, tidyr::extract. The two packages can be used to compute the same result, but the code/syntax is different.

Longer more readable syntax

In this first comparison we use a syntax with each group name on the same line as its pattern. Here are some observations from the comparison:

The namedCapture code is shorter. The tidyr code is longer mostly because the for loop that you see below for tidyr is hidden inside the definition of namedCapture::df_match_variable.
Converting extracted character groups to numeric column types is specified via the convert argument of tidyr::extract, which uses utils::type.convert. Because type.convert does not know how to convert strings like 111,000 to integer, we first need to use remove.commas to create a new data.frame to use as input to tidyr::extract. In contrast namedCapture supports arbitrary group-specific type conversion functions; we specify to.int on the same line as the corresponding name/pattern for the chromStart/chromEnd groups.

## First define data: two character columns (position, JobID) to parse.
(sacct.df <- data.frame(
  position = c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA_character_, # a missing value will not match either.
    "chr1:110-111 chr2:220-222" # contains two possible matches.
  ),
  JobID = c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"
  ),
  stringsAsFactors = FALSE
))
#>                        position              JobID
#> 1 chr10:213,054,000-213,055,000        13937810_25
#> 2          chrM:111,000-222,000  13937810_25.batch
#> 3           this will not match 13937810_25.extern
#> 4                          <NA>     14022192_[1-3]
#> 5     chr1:110-111 chr2:220-222       14022204_[4]
## Delete every comma from each element of x, e.g. "111,000" -> "111000".
remove.commas <- function(x) {
  gsub(",", "", x, fixed = TRUE)
}
## Results of the long-syntax comparison, one list element per package.
long.list <- list()

## namedCapture: 29 lines of code (not counting comment-only lines).
## In the pattern lists below, a named element (name="pattern") defines a
## capture group, an unnamed string is pattern text outside of any named
## group, and a function written right after a named pattern (as.integer,
## to.int) is the type conversion function for that group.
##
## range.list: a bracketed task range such as [1-3] or [4].
range.list <- list(
  "\\[",
  task1="[0-9]+", as.integer,
  "(?:-",#begin optional end of range (non-capturing group).
  taskN="[0-9]+", as.integer,
  ")?", #end is optional.
  "\\]")
## task.list: either a single task number or a bracketed range.
task.list <- list(
  "(?:",#begin alternate
  task="[0-9]+", as.integer,
  "|",#either one task(above) or range(below)
  range.list,
  ")")#end alternate
## to.int: delete commas, then convert to integer ("111,000" -> 111000).
to.int <- function(x)as.integer(remove.commas(x))
## Match the JobID and position columns of sacct.df. As the printed result
## shows, each group becomes a new column named <column>.<group>.
(long.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID=list(
    job="[0-9]+", as.integer,
    "_",
    task.list,
    "(?:[.]",
    type=".*",
    ")?"),
  position=list(
    chrom="chr.*?",
    ":",
    chromStart=".*?", to.int,
    "-",
    chromEnd="[0-9,]*", to.int)))
#>                        position              JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000        13937810_25  13937810         25
#> 2          chrM:111,000-222,000  13937810_25.batch  13937810         25
#> 3           this will not match 13937810_25.extern  13937810         25
#> 4                          <NA>     14022192_[1-3]  14022192         NA
#> 5     chr1:110-111 chr2:220-222       14022204_[4]  14022204         NA
#>   JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1          NA          NA                     chr10           213054000
#> 2          NA          NA      batch           chrM              111000
#> 3          NA          NA     extern           <NA>                  NA
#> 4           1           3                      <NA>                  NA
#> 5           4          NA                      chr1                 110
#>   position.chromEnd
#> 1         213055000
#> 2            222000
#> 3                NA
#> 4                NA
#> 5               111

## tidyr: 46 lines of code (not counting comment-only lines).
## The patterns are the same as in the namedCapture code, but stored in
## character vectors: a named element is a capture group, an unnamed
## element is pattern text outside of any capture group. No conversion
## functions are given here; conversion is done via convert=TRUE below.
##
## range.vec: a bracketed task range such as [1-3] or [4].
range.vec <- c(
  "\\[",
  task1="[0-9]+", 
  "(?:-",#begin optional end of range (non-capturing group).
  taskN="[0-9]+", 
  ")?", #end is optional.
  "\\]")
## task.vec: either a single task number or a bracketed range.
task.vec <- c(
  "(?:",#begin alternate
  task="[0-9]+", 
  "|",#either one task(above) or range(below)
  range.vec,
  ")")#end alternate
## One pattern vector per column of sacct.df to parse.
regex.list <- list(
  JobID=c(
    job="[0-9]+", 
    "_",
    task.vec,
    "(?:[.]",
    type=".*",
    ")?"),
  position=c(
    chrom="chr.*?",
    ":",
    chromStart=".*?",
    "-",
    chromEnd="[0-9,]*"))
## Strip the commas from position before extracting, so that convert=TRUE
## can turn the chromStart/chromEnd groups into integers.
tidyr.input <- transform(
  sacct.df,
  position=remove.commas(position))
## Start with the original (unmodified) columns, then add one data frame
## of extracted groups per parsed column.
tidyr.df.list <- list(sacct.df)
for(col.name in names(regex.list)){
  regex.vec <- regex.list[[col.name]]
  ## Only the named elements are capture groups.
  is.group <- names(regex.vec)!=""
  ## Wrap each group pattern in parentheses; leave the other text as is.
  format.vec <- ifelse(is.group, "(%s)", "%s")
  group.vec <- sprintf(format.vec, regex.vec)
  regex <- paste(group.vec, collapse="")
  group.names <- names(regex.vec)[is.group]
  result <- tidyr::extract(
    tidyr.input, col.name, group.names, regex, convert=TRUE)
  ## Keep only the new group columns and prefix them with the name of the
  ## column they came from, e.g. JobID.job.
  to.save <- result[, group.names, drop=FALSE]
  names(to.save) <- paste0(col.name, ".", group.names)
  tidyr.df.list[[col.name]] <- to.save
}
## NOTE(review): the list names are dropped presumably so that cbind does
## not prefix the column names a second time -- confirm.
names(tidyr.df.list) <- NULL
long.list$tidyr <- do.call(cbind, tidyr.df.list)

## Make sure the results are the same.
## First compare the column names: one row per package, one column per
## column name.
t(sapply(long.list, names))
#>              [,1]       [,2]    [,3]        [,4]         [,5]         
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> tidyr        "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#>              [,6]          [,7]         [,8]             [,9]                 
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#> tidyr        "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#>              [,10]              
#> namedCapture "position.chromEnd"
#> tidyr        "position.chromEnd"
## Then compare the class of each column, again one row per package.
t(sapply(long.list, sapply, class))
#>              position    JobID       JobID.job JobID.task JobID.task1
#> namedCapture "character" "character" "integer" "integer"  "integer"  
#> tidyr        "character" "character" "integer" "integer"  "integer"  
#>              JobID.taskN JobID.type  position.chrom position.chromStart
#> namedCapture "integer"   "character" "character"    "integer"          
#> tidyr        "integer"   "character" "character"    "integer"          
#>              position.chromEnd
#> namedCapture "integer"        
#> tidyr        "integer"
## Recode NA in the tidyr JobID.type column as "" before comparing: the
## namedCapture result printed above has "" for the rows without a type.
long.list$tidyr$JobID.type[
  is.na(long.list$tidyr$JobID.type)] <- ""
identical(long.list$tidyr, long.list$namedCapture)
#> [1] TRUE

Exercise for the reader: use rematch2::bind_re_match instead of tidyr::extract (you should only have to change a few lines of code in the for loop).

Shorter less readable syntax

This second comparison uses a syntax with the entire regex on one line. In my opinion this syntax makes the regular expressions more difficult to read/understand. Complicated regular expressions like the one used for matching the JobID column are not maintainable/understandable at all using this syntax.

## First define data: two character columns (position, JobID) to parse.
(sacct.df <- data.frame(
  position = c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA_character_, # a missing value will not match either.
    "chr1:110-111 chr2:220-222" # contains two possible matches.
  ),
  JobID = c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"
  ),
  stringsAsFactors = FALSE
))
#>                        position              JobID
#> 1 chr10:213,054,000-213,055,000        13937810_25
#> 2          chrM:111,000-222,000  13937810_25.batch
#> 3           this will not match 13937810_25.extern
#> 4                          <NA>     14022192_[1-3]
#> 5     chr1:110-111 chr2:220-222       14022204_[4]
## Results of the short-syntax comparison, one list element per package.
short.list <- list()

## tidyr alternate (13 lines total, not counting comment-only lines)
## e() extracts the groups of `pattern` from column `col.name` of the
## global sacct.df (with convert=TRUE) and returns only the new group
## columns, each renamed to <col.name>.<group name>.
e <- function(col.name, group.names, pattern){
  extracted <- tidyr::extract(
    sacct.df, col.name, group.names, pattern, convert=TRUE)
  new.names <- paste0(col.name, ".", group.names)
  stats::setNames(
    extracted[, group.names, drop=FALSE], new.names)
}
## Each pattern is one string with unnamed groups, so the group names are
## given separately, in the same order as the groups.
short.list$tidyr <- do.call(cbind, list(
  sacct.df,
  e("JobID", c("job", "task", "task1", "taskN", "type"),
    "([0-9]+)_(?:([0-9]+)|\\[([0-9]+)(?:-([0-9]+))?\\])(?:[.](.*))?"),
  e("position", c("chrom", "chromStart", "chromEnd"),
    "(chr.*?):(.*?)-([0-9,]*)")))

## namedCapture alternate (7 lines of code in total: this call plus the
## type conversion step after the printed result).
## Each pattern is a single string; the groups are named inside the regex
## via (?P<name>...) rather than via R argument names, and no conversion
## functions are given, so the printed group columns are not yet converted.
(short.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID="(?P<job>[0-9]+)_(?:(?P<task>[0-9]+)|\\[(?P<task1>[0-9]+)(?:-(?P<taskN>[0-9]+))?\\])(?:[.](?P<type>.*))?",
  position="(?P<chrom>chr.*?):(?P<chromStart>.*?)-(?P<chromEnd>[0-9,]*)"))
#>                        position              JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000        13937810_25  13937810         25
#> 2          chrM:111,000-222,000  13937810_25.batch  13937810         25
#> 3           this will not match 13937810_25.extern  13937810         25
#> 4                          <NA>     14022192_[1-3]  14022192           
#> 5     chr1:110-111 chr2:220-222       14022204_[4]  14022204           
#>   JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1                                             chr10         213,054,000
#> 2                              batch           chrM             111,000
#> 3                             extern           <NA>                <NA>
#> 4           1           3                      <NA>                <NA>
#> 5           4                                  chr1                 110
#>   position.chromEnd
#> 1       213,055,000
#> 2           222,000
#> 3              <NA>
#> 4              <NA>
#> 5               111
## Convert every column with type.convert, as tidyr does for convert=TRUE.
short.list$namedCapture[] <- lapply(
  short.list$namedCapture,
  type.convert, as.is=TRUE)

## Make sure the results are the same.
## First compare the column names: one row per package, one column per
## column name.
t(sapply(short.list, names))
#>              [,1]       [,2]    [,3]        [,4]         [,5]         
#> tidyr        "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#>              [,6]          [,7]         [,8]             [,9]                 
#> tidyr        "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom" "position.chromStart"
#>              [,10]              
#> tidyr        "position.chromEnd"
#> namedCapture "position.chromEnd"
## Then compare the class of each column, again one row per package.
## position.chromStart/chromEnd stay character here because the commas
## were not removed from position in this section.
t(sapply(short.list, sapply, class))
#>              position    JobID       JobID.job JobID.task JobID.task1
#> tidyr        "character" "character" "integer" "integer"  "integer"  
#> namedCapture "character" "character" "integer" "integer"  "integer"  
#>              JobID.taskN JobID.type  position.chrom position.chromStart
#> tidyr        "integer"   "character" "character"    "character"        
#> namedCapture "integer"   "character" "character"    "character"        
#>              position.chromEnd
#> tidyr        "character"      
#> namedCapture "character"
## Recode NA in the tidyr JobID.type column as "" before comparing: the
## namedCapture result printed above has "" for the rows without a type.
short.list$tidyr$JobID.type[
  is.na(short.list$tidyr$JobID.type)] <- ""
identical(short.list$tidyr, short.list$namedCapture)
#> [1] TRUE

Comparison with rematch2

rematch2::bind_re_match is similar to tidyr::extract but additionally supports named capture regular expressions. Overall the comparison shows that both packages can use a relatively verbose and readable syntax to define complex regular expressions piece by piece:

## namedCapture pattern for a bracketed task range such as [1-3] or [4].
## Here the optional "-taskN" part is written as a nested list followed by
## "?", instead of the literal "(?:-" and ")?" strings used earlier.
## NOTE(review): presumably namedCapture turns a nested list into a
## non-capturing group -- confirm in the namedCapture documentation.
range.list <- list(
  "\\[",
  task1="[0-9]+", as.integer,
  list(
    "-",#begin optional end of range.
    taskN="[0-9]+", as.integer
  ), "?", #end is optional.
  "\\]")
## Only the range groups are matched here: rows without a bracketed range
## get NA in both new columns (see the printed result).
namedCapture::df_match_variable(sacct.df, JobID=range.list)
#>                        position              JobID JobID.task1 JobID.taskN
#> 1 chr10:213,054,000-213,055,000        13937810_25          NA          NA
#> 2          chrM:111,000-222,000  13937810_25.batch          NA          NA
#> 3           this will not match 13937810_25.extern          NA          NA
#> 4                          <NA>     14022192_[1-3]           1           3
#> 5     chr1:110-111 chr2:220-222       14022204_[4]           4          NA

## The same range pattern for rematch2, assembled piece by piece into one
## regex string; group names are written inside the regex as (?<name>...).
range.pat <- paste(c(
  "\\[",
  "(?<task1>[0-9]+)",
  "(?:",
  "-", # begin optional end of range.
  "(?<taskN>[0-9]+)",
  ")?", # end is optional.
  "\\]"), collapse="")
## Match range.pat against the JobID column; as the printed result shows,
## the task1 and taskN groups are added to sacct.df as character columns.
rematch2::bind_re_match(sacct.df, JobID, range.pat)
#>                        position              JobID task1 taskN
#> 1 chr10:213,054,000-213,055,000        13937810_25  <NA>  <NA>
#> 2          chrM:111,000-222,000  13937810_25.batch  <NA>  <NA>
#> 3           this will not match 13937810_25.extern  <NA>  <NA>
#> 4                          <NA>     14022192_[1-3]     1     3
#> 5     chr1:110-111 chr2:220-222       14022204_[4]     4
## namedCapture task pattern: "_" followed by either a single task number
## or a bracketed range (the alternation is grouped by the nested list).
task.list <- list(
  "_",
  list(
    task="[0-9]+", as.integer,
    "|",#either one task(above) or range(below)
    range.list))
## As printed below, rows with a single task fill JobID.task and rows with
## a range fill JobID.task1 (and JobID.taskN when the range has an end).
namedCapture::df_match_variable(sacct.df, JobID=task.list)
#>                        position              JobID JobID.task JobID.task1
#> 1 chr10:213,054,000-213,055,000        13937810_25         25          NA
#> 2          chrM:111,000-222,000  13937810_25.batch         25          NA
#> 3           this will not match 13937810_25.extern         25          NA
#> 4                          <NA>     14022192_[1-3]         NA           1
#> 5     chr1:110-111 chr2:220-222       14022204_[4]         NA           4
#>   JobID.taskN
#> 1          NA
#> 2          NA
#> 3          NA
#> 4           3
#> 5          NA

## rematch2 task pattern: "_" followed by either a single task number or
## the bracketed range defined by range.pat.
task.pat <- paste(c(
  "_",
  "(?:",
  "(?<task>[0-9]+)",
  "|", # either one task (above) or range (below).
  range.pat,
  ")"), collapse="")
rematch2::bind_re_match(sacct.df, JobID, task.pat)
#>                        position              JobID task task1 taskN
#> 1 chr10:213,054,000-213,055,000        13937810_25   25            
#> 2          chrM:111,000-222,000  13937810_25.batch   25            
#> 3           this will not match 13937810_25.extern   25            
#> 4                          <NA>     14022192_[1-3]          1     3
#> 5     chr1:110-111 chr2:220-222       14022204_[4]          4

## namedCapture pattern for the whole JobID: job number, then the task
## part, then an optional ".type" suffix (nested list followed by "?").
job.list <- list(
  job="[0-9]+", as.integer,
  task.list,
  list(
    "[.]",
    type=".*"
  ), "?")
## Keep the result: it is the input of the position matching further down.
(job.namedCapture <- namedCapture::df_match_variable(sacct.df, JobID=job.list))
#>                        position              JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000        13937810_25  13937810         25
#> 2          chrM:111,000-222,000  13937810_25.batch  13937810         25
#> 3           this will not match 13937810_25.extern  13937810         25
#> 4                          <NA>     14022192_[1-3]  14022192         NA
#> 5     chr1:110-111 chr2:220-222       14022204_[4]  14022204         NA
#>   JobID.task1 JobID.taskN JobID.type
#> 1          NA          NA           
#> 2          NA          NA      batch
#> 3          NA          NA     extern
#> 4           1           3           
#> 5           4          NA

## rematch2 pattern for the whole JobID: job number, then the task part
## (task.pat), then an optional ".type" suffix.
job.pat <- paste(c(
  "(?<job>[0-9]+)",
  task.pat,
  "(?:",
  "[.]",
  "(?<type>.*)",
  ")?"), collapse="")
(job.rematch2 <- rematch2::bind_re_match(sacct.df, JobID, job.pat))
#>                        position              JobID      job task task1 taskN
#> 1 chr10:213,054,000-213,055,000        13937810_25 13937810   25            
#> 2          chrM:111,000-222,000  13937810_25.batch 13937810   25            
#> 3           this will not match 13937810_25.extern 13937810   25            
#> 4                          <NA>     14022192_[1-3] 14022192          1     3
#> 5     chr1:110-111 chr2:220-222       14022204_[4] 14022204          4      
#>     type
#> 1       
#> 2  batch
#> 3 extern
#> 4       
#> 5

## Add the position groups to the JobID result. to.int (defined earlier in
## this file) removes commas before converting chromStart/chromEnd to
## integer.
pos.namedCapture <- namedCapture::df_match_variable(
  job.namedCapture, position=list(
    chrom="chr.*?",
    ":",
    chromStart=".*?", to.int,
    "-",
    chromEnd="[0-9,]*", to.int))
## Show the column types: the groups with a conversion function are int.
str(pos.namedCapture)
#> 'data.frame':    5 obs. of  10 variables:
#>  $ position           : chr  "chr10:213,054,000-213,055,000" "chrM:111,000-222,000" "this will not match" NA ...
#>  $ JobID              : chr  "13937810_25" "13937810_25.batch" "13937810_25.extern" "14022192_[1-3]" ...
#>  $ JobID.job          : int  13937810 13937810 13937810 14022192 14022204
#>  $ JobID.task         : int  25 25 25 NA NA
#>  $ JobID.task1        : int  NA NA NA 1 4
#>  $ JobID.taskN        : int  NA NA NA 3 NA
#>  $ JobID.type         : chr  "" "batch" "extern" "" ...
#>  $ position.chrom     : chr  "chr10" "chrM" NA NA ...
#>  $ position.chromStart: int  213054000 111000 NA NA 110
#>  $ position.chromEnd  : int  213055000 222000 NA NA 111

## Add the position groups to the rematch2 JobID result, using the same
## pattern pieces with the group names written inside the regex.
pos.rematch2 <- rematch2::bind_re_match(
  job.rematch2,  position, paste0(
    "(?<chrom>chr.*?)",
    ":",
    "(?<chromStart>.*?)", 
    "-",
    "(?<chromEnd>[0-9,]*)"))
## Show the column types: every column is chr, and the group columns are
## not prefixed with the name of the matched column.
str(pos.rematch2)
#> 'data.frame':    5 obs. of  10 variables:
#>  $ position  : chr  "chr10:213,054,000-213,055,000" "chrM:111,000-222,000" "this will not match" NA ...
#>  $ JobID     : chr  "13937810_25" "13937810_25.batch" "13937810_25.extern" "14022192_[1-3]" ...
#>  $ job       : chr  "13937810" "13937810" "13937810" "14022192" ...
#>  $ task      : chr  "25" "25" "25" "" ...
#>  $ task1     : chr  "" "" "" "1" ...
#>  $ taskN     : chr  "" "" "" "3" ...
#>  $ type      : chr  "" "batch" "extern" "" ...
#>  $ chrom     : chr  "chr10" "chrM" NA NA ...
#>  $ chromStart: chr  "213,054,000" "111,000" NA NA ...
#>  $ chromEnd  : chr  "213,055,000" "222,000" NA NA ...

The main difference in syntax is that group names are specified in the regular expression string literal for rematch2, whereas group names are specified as R argument names for namedCapture. A difference in the result is that all columns of pos.rematch2 are character, whereas some columns of pos.namedCapture have already been converted to integer. With rematch2, type conversion may be accomplished as a post-processing step:

## Post-process the rematch2 result so that it can be compared with the
## namedCapture result: add columns named <column>.<group>, converting the
## numeric groups with to.int (defined earlier in this file).
## NOTE(review): stringsAsFactors=FALSE is presumably forwarded by
## transform to data.frame rather than becoming a column -- confirm.
converted.rematch2 <- transform(
  pos.rematch2,
  JobID.job=to.int(job),
  JobID.task1=to.int(task1),
  JobID.taskN=to.int(taskN),
  JobID.task=to.int(task),
  JobID.type=type,
  position.chrom=chrom,
  position.chromStart=to.int(chromStart),
  position.chromEnd=to.int(chromEnd),
  stringsAsFactors=FALSE)
## Keep only the columns of pos.namedCapture, in the same order, which
## drops the unprefixed rematch2 group columns.
some.rematch2 <- converted.rematch2[, names(pos.namedCapture)]
identical(some.rematch2, pos.namedCapture)
#> [1] TRUE

Exercise for the reader: convert all the rematch2::bind_re_match calls in this section to tidyr::extract calls.