data.table::rleid() is pretty cool!


Tom Mock


November 27, 2022

Longer example on QB Starts

Create a dataframe

# Create a small example tibble of "streaks", i.e. runs of repeated values:
# `x` holds "a" twice, then "b" three times, then "a" five times,
# and `num` is simply the row number (1 through 10).
ex_df <- tibble(
  x = c("a", "a", rep("b", 3), rep("a", 5)),
  num = 1:10
)

# print the data
ex_df
# A tibble: 10 × 2
   x       num
   <chr> <int>
 1 a         1
 2 a         2
 3 b         3
 4 b         4
 5 b         5
 6 a         6
 7 a         7
 8 a         8
 9 a         9
10 a        10

Example of rle or run-length encoding

# rle() is run-length encoding: it summarizes a vector into the length
# of each run of repeated values and the value that is repeated.
# Technically this is a form of recoverable data compression,
# i.e. you end up with fewer bytes, but the result still tells you what
# the full-length vector was, so the original can be recreated from it.

# the output below can be read as the letters a, b, a
# where the first a is repeated 2x
# the b is repeated 3x
# the next a is repeated 5x
rle(ex_df$x)
Run Length Encoding
  lengths: int [1:3] 2 3 5
  values : chr [1:3] "a" "b" "a"

Example of rleid

# rleid() generates an id for each run of repeated values and returns
# a vector of the same length as the original vector.
# Print the original vector first, then its run ids, to compare them.
ex_df$x
data.table::rleid(ex_df$x)

 [1] "a" "a" "b" "b" "b" "a" "a" "a" "a" "a"
 [1] 1 1 2 2 2 3 3 3 3 3
# rleid() accepts a plain vector as well as a column of a data.frame,
# data.table or tibble. Because its result has one element per input
# element, it fits inside mutate(): the number of rows stays the same.
ex_df %>% mutate(rleid = data.table::rleid(x))
# A tibble: 10 × 3
   x       num rleid
   <chr> <int> <int>
 1 a         1     1
 2 a         2     1
 3 b         3     2
 4 b         4     2
 5 b         5     2
 6 a         6     3
 7 a         7     3
 8 a         8     3
 9 a         9     3
10 a        10     3

rle is a summary function

# In contrast, rle() behaves like a _summary_ function: it yields one
# row per run, so the result has fewer rows than the input.
# NOTE(review): dplyr >= 1.1.0 reportedly prefers reframe() for
# summaries that return more than one row -- confirm before upgrading.
ex_df %>%
  summarize(
    lengths = rle(x)$lengths,
    values = rle(x)$values
  )
# A tibble: 3 × 2
  lengths values
    <int> <chr> 
1       2 a     
2       3 b     
3       5 a     

Recover the original data

# Round trip: compress the data with rle(), then rebuild the original.
# Step 1: summarize `x` into one row per run (lengths + values).
# Step 2: `%T>% print()` (magrittr tee pipe) prints that intermediate
#         summary but passes it through unchanged to the next step.
# Step 3: expand each value `lengths` times to recover `x`, and rebuild
#         `num` as the row number; seq_len() is used rather than 1:n so
#         an empty input would give an empty sequence instead of c(1, 0).
# NOTE(review): both steps rely on summarize() returning a number of rows
# different from 1 per group; dplyr >= 1.1.0 reportedly prefers reframe()
# for that -- confirm before upgrading.
final_df <- ex_df %>%
  summarize(
    lengths = rle(x)$lengths,
    values = rle(x)$values
  ) %T>%
  print() %>%
  summarize(
    x = rep(values, times = lengths),
    num = seq_len(sum(lengths))
  )

# print the recreated data
final_df
# A tibble: 3 × 2
  lengths values
    <int> <chr> 
1       2 a     
2       3 b     
3       5 a     
# A tibble: 10 × 2
   x       num
   <chr> <int>
 1 a         1
 2 a         2
 3 b         3
 4 b         4
 5 b         5
 6 a         6
 7 a         7
 8 a         8
 9 a         9
10 a        10
# confirm that the recreated tibble matches the original one
all.equal(target = final_df, current = ex_df)
[1] TRUE
