Remove to publish


layout: post title: “A comparison of R recoding methods” category: posts lang: en —

Some text

library(microbenchmark)
library(data.table)

set.seed(1337)
lev <- c("Red", "Blue", "Green", "Yellow", "Pink", NA)
d <- data.frame(i = 1:100000, x = sample(lev, 100000, replace = TRUE), stringsAsFactors = FALSE)
d_valid <- d
## Reference recoding, for validation
d_valid$xrec <- d$x
d_valid$xrec[d_valid$xrec %in% c("Red", "Blue")] <- "rb"
d_valid$xrec[d_valid$xrec == "Green"] <- "g"
d_valid$xrec[is.na(d_valid$xrec)] <- "NR"
d_valid$xrec[d_valid$xrec == "Pink"] <- NA
## Lookup table recoding (see adv-r)
rec_lookup <- function(d) {
  d$xrec <- d$x
  d$xrec[is.na(d$xrec)] <- "NR"
  lookup <- c("Red" = "rb", "Blue" = "rb", "Green" = "g", "Yellow" = "Yellow", "NR" = "NR", "Pink" = NA)
  d$xrec <- unname(lookup[d$xrec])
  d
}
## Indexing recoding
rec_index <- function(d) {
  d$xrec <- d$x
  d$xrec[d$xrec %in% c("Red", "Blue")] <- "rb"
  d$xrec[d$xrec == "Green"] <- "g"
  d$xrec[is.na(d$xrec)] <- "NR"  
  d$xrec[d$xrec == "Pink"] <- NA
  d
}
## Within recoding
rec_within <- function(d) {
  d <- within(d, {
    xrec <- x
    xrec[xrec %in% c("Red", "Blue")] <- "rb"
    xrec[xrec == "Green"] <- "g"
    xrec[is.na(xrec)] <- "NR"
    xrec[xrec == "Pink"] <- NA
  })
  d
}
## Factor labels recoding
rec_factor <- function(d) {
  d$xrec <- factor(d$x, exclude = NULL)
  levels(d$xrec) <- c("rb", "g", NA, "rb", "Yellow", "NB")
  d$xrec <- as.character(d$xrec)
  d
}
## Data.table recoding
rec_dt <- function(d) {
  d <- data.table(d)
  d[,xrec := x]
  d[xrec %in% c("Red", "Blue"), xrec := "rb"]
  d[xrec == "Green", xrec := "g"]
  d[is.na(xrec), xrec := "NB"]
  d[xrec == "Pink", xrec := NA]
  d
}
## Keyed data.table recoding
rec_dtk <- function(d) {
  d <- data.table(d, key = "x")
  d[, xrec := x]
  d[c("Red", "Blue"), xrec := "rb"]
  d["Green", xrec := "g"]
  d[is.na(x), xrec := "NB"]
  d["Pink", xrec := NA]
  d
}
d_lookup <- rec_lookup(d)
d_index <- rec_index(d)
d_within <- rec_within(d)
d_factor <- rec_factor(d)
d_dt <- rec_dt(d)
d_dtk <- rec_dtk(d)
all.equal(d_valid$xrec, d_lookup$xrec, d_index$xrec, d_within$xrec, d_factor$xrec, d_dt$xrec, d_dtk$xrec)
## [1] TRUE
microbenchmark(rec_lookup(d),
               rec_index(d),
               rec_within(d),
               rec_factor(d),
               rec_dt(d),
               rec_dtk(d), times = 250L)
## Unit: milliseconds
##           expr      min       lq      mean   median        uq
##  rec_lookup(d) 3.179998 3.547368  4.178579 4.114454  4.430596
##   rec_index(d) 8.684560 9.320195 10.987052 9.678278 10.066461
##  rec_within(d) 8.653899 9.152364 10.498883 9.640702  9.984397
##  rec_factor(d) 5.554221 6.089423  7.906378 6.502821  6.841976
##      rec_dt(d) 7.391148 8.541243 10.934342 8.852182  9.294913
##     rec_dtk(d) 5.260603 5.686642  6.581498 6.290361  6.550803
##       max neval cld
##  36.12042   250 a  
##  44.74959   250   c
##  43.12359   250   c
##  41.75789   250  b 
##  42.81195   250   c
##  39.96084   250  b