Title: | A Poor Man's Dependency Free Recreation of 'dplyr' |
---|---|
Description: | A replication of key functionality from 'dplyr' and the wider 'tidyverse' using only 'base'. |
Authors: | Nathan Eastwood [aut, cre],
Etienne Bacher [ctb] |
Maintainer: | Nathan Eastwood <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.2.7 |
Built: | 2025-02-25 04:15:49 UTC |
Source: | https://github.com/nathaneastwood/poorman |
across()
makes it easy to apply the same transformation to multiple columns, allowing you to use select()
semantics inside "data-masking" functions like summarise()
and mutate()
.
if_any()
and if_all()
are used to apply the same predicate function to a selection of columns and combine the
results into a single logical vector.
across()
supersedes the family of dplyr
"scoped variants" like summarise_at()
, summarise_if()
, and
summarise_all()
and therefore these functions will not be implemented in poorman
.
across(.cols = everything(), .fns = NULL, ..., .names = NULL) if_any(.cols, .fns = NULL, ..., .names = NULL) if_all(.cols, .fns = NULL, ..., .names = NULL)
across(.cols = everything(), .fns = NULL, ..., .names = NULL) if_any(.cols, .fns = NULL, ..., .names = NULL) if_all(.cols, .fns = NULL, ..., .names = NULL)
.fns |
Functions to apply to each of the selected columns. Possible values are:
Within these functions you can use |
... |
Additional arguments for the function calls in |
.names |
A glue specification that describes how to name the output
columns. This can use |
cols , .cols
|
< |
across()
returns a data.frame
with one column for each column in .cols
and each function in .fns
.
if_any()
and if_all()
return a logical vector.
# across() ----------------------------------------------------------------- iris %>% group_by(Species) %>% summarise(across(starts_with("Sepal"), mean)) iris %>% mutate(across(where(is.factor), as.character)) # Additional parameters can be passed to functions iris %>% group_by(Species) %>% summarise(across(starts_with("Sepal"), mean, na.rm = TRUE)) # A named list of functions iris %>% group_by(Species) %>% summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd))) # Use the .names argument to control the output names iris %>% group_by(Species) %>% summarise( across(starts_with("Sepal"), mean, .names = c("mean_sepal_length", "mean_sepal_width")) ) # if_any() and if_all() ---------------------------------------------------- iris %>% filter(if_any(ends_with("Width"), ~ . > 4)) iris %>% filter(if_all(ends_with("Width"), ~ . > 2))
# across() ----------------------------------------------------------------- iris %>% group_by(Species) %>% summarise(across(starts_with("Sepal"), mean)) iris %>% mutate(across(where(is.factor), as.character)) # Additional parameters can be passed to functions iris %>% group_by(Species) %>% summarise(across(starts_with("Sepal"), mean, na.rm = TRUE)) # A named list of functions iris %>% group_by(Species) %>% summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd))) # Use the .names argument to control the output names iris %>% group_by(Species) %>% summarise( across(starts_with("Sepal"), mean, .names = c("mean_sepal_length", "mean_sepal_width")) ) # if_any() and if_all() ---------------------------------------------------- iris %>% filter(if_any(ends_with("Width"), ~ . > 4)) iris %>% filter(if_all(ends_with("Width"), ~ . > 2))
Order rows of a data.frame
by an expression involving its variables.
arrange(.data, ...)
arrange(.data, ...)
.data |
A |
... |
A comma separated vector of unquoted name(s) to order the data by. |
A data.frame
.
arrange(mtcars, mpg) mtcars %>% arrange(mpg) mtcars %>% arrange(cyl, mpg)
arrange(mtcars, mpg) mtcars %>% arrange(mpg) mtcars %>% arrange(cyl, mpg)
This is a shortcut for x >= left & x <= right
.
between(x, left, right)
between(x, left, right)
x |
A |
left , right
|
Boundary values. |
A logical
vector the same length as x
.
between(1:12, 7, 9) x <- rnorm(1e2) x[between(x, -1, 1)]
between(1:12, 7, 9) x <- rnorm(1e2) x[between(x, -1, 1)]
data.frame
s by row and column
Efficiently bind multiple data.frame
s by row and column
bind_cols(...) bind_rows(..., .id = NULL)
bind_cols(...) bind_rows(..., .id = NULL)
... |
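Each argument can either be a data.frame, a list that could be a data.frame, or a list of data.frames. When row-binding, columns are matched by name, and any missing columns will be filled with NA. When column-binding, rows are matched by position, so all data.frames must have the same number of rows. |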
.id |
When |
one <- mtcars[1:4, ] two <- mtcars[9:12, ] # You can supply data frames as arguments: bind_rows(one, two) # The contents of lists are spliced automatically: bind_rows(list(one, two)) bind_rows(split(mtcars, mtcars$cyl)) bind_rows(list(one, two), list(two, one)) # In addition to data frames, you can supply vectors. In the rows # direction, the vectors represent rows and should have inner # names: bind_rows( c(a = 1, b = 2), c(a = 3, b = 4) ) # You can mix vectors and data frames: bind_rows( c(a = 1, b = 2), data.frame(a = 3:4, b = 5:6), c(a = 7, b = 8) ) # When you supply a column name with the `.id` argument, a new # column is created to link each row to its original data frame bind_rows(list(one, two), .id = "id") bind_rows(list(a = one, b = two), .id = "id") bind_rows("group 1" = one, "group 2" = two, .id = "groups") ## Not run: # Rows need to match when column-binding bind_cols(data.frame(x = 1:3), data.frame(y = 1:2)) # even with 0 columns bind_cols(data.frame(x = 1:3), data.frame()) ## End(Not run) bind_cols(one, two) bind_cols(list(one, two))
one <- mtcars[1:4, ] two <- mtcars[9:12, ] # You can supply data frames as arguments: bind_rows(one, two) # The contents of lists are spliced automatically: bind_rows(list(one, two)) bind_rows(split(mtcars, mtcars$cyl)) bind_rows(list(one, two), list(two, one)) # In addition to data frames, you can supply vectors. In the rows # direction, the vectors represent rows and should have inner # names: bind_rows( c(a = 1, b = 2), c(a = 3, b = 4) ) # You can mix vectors and data frames: bind_rows( c(a = 1, b = 2), data.frame(a = 3:4, b = 5:6), c(a = 7, b = 8) ) # When you supply a column name with the `.id` argument, a new # column is created to link each row to its original data frame bind_rows(list(one, two), .id = "id") bind_rows(list(a = one, b = two), .id = "id") bind_rows("group 1" = one, "group 2" = two, .id = "groups") ## Not run: # Rows need to match when column-binding bind_cols(data.frame(x = 1:3), data.frame(y = 1:2)) # even with 0 columns bind_cols(data.frame(x = 1:3), data.frame()) ## End(Not run) bind_cols(one, two) bind_cols(list(one, two))
if()
This function allows you to vectorise multiple if_else()
statements. It is an R equivalent of the SQL CASE WHEN
statement. If no cases match, NA
is returned.
case_when(...)
case_when(...)
... |
A sequence of two-sided formulas. The left hand side (LHS) determines which values match this case. The right hand side (RHS) provides the replacement value. The LHS must evaluate to a logical vector. The RHS does not need to be logical, but all RHSs must evaluate to the same type of vector. Both LHS and RHS may have the same length of either 1 or n. The value of n must be consistent across all cases. The
case of
|
A vector of length 1 or n, matching the length of the logical input or output vectors, with the type (and attributes) of the first RHS. Inconsistent lengths or types will generate an error.
x <- 1:50 case_when( x %% 35 == 0 ~ "fizz buzz", x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", TRUE ~ as.character(x) ) # Like an if statement, the arguments are evaluated in order, so you must # proceed from the most specific to the most general. This won't work: case_when( TRUE ~ as.character(x), x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", x %% 35 == 0 ~ "fizz buzz" ) # If none of the cases match, NA is used: case_when( x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", x %% 35 == 0 ~ "fizz buzz" ) # Note that NA values in the vector x do not get special treatment. If you want # to explicitly handle NA values you can use the `is.na` function: x[2:4] <- NA_real_ case_when( x %% 35 == 0 ~ "fizz buzz", x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", is.na(x) ~ "nope", TRUE ~ as.character(x) ) # All RHS values need to be of the same type. Inconsistent types will throw an error. # This applies also to NA values used in RHS: NA is logical, use # typed values like NA_real_, NA_complex, NA_character_, NA_integer_ as appropriate. case_when( x %% 35 == 0 ~ NA_character_, x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", TRUE ~ as.character(x) ) case_when( x %% 35 == 0 ~ 35, x %% 5 == 0 ~ 5, x %% 7 == 0 ~ 7, TRUE ~ NA_real_ ) # case_when() evaluates all RHS expressions, and then constructs its # result by extracting the selected (via the LHS expressions) parts. # In particular NaN are produced in this case: y <- seq(-2, 2, by = .5) case_when( y >= 0 ~ sqrt(y), TRUE ~ y ) ## Not run: case_when( x %% 35 == 0 ~ 35, x %% 5 == 0 ~ 5, x %% 7 == 0 ~ 7, TRUE ~ NA ) ## End(Not run) # case_when is particularly useful inside mutate when you want to # create a new variable that relies on a complex combination of existing # variables mtcars %>% mutate( efficient = case_when( mpg > 25 ~ TRUE, TRUE ~ FALSE ) )
x <- 1:50 case_when( x %% 35 == 0 ~ "fizz buzz", x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", TRUE ~ as.character(x) ) # Like an if statement, the arguments are evaluated in order, so you must # proceed from the most specific to the most general. This won't work: case_when( TRUE ~ as.character(x), x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", x %% 35 == 0 ~ "fizz buzz" ) # If none of the cases match, NA is used: case_when( x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", x %% 35 == 0 ~ "fizz buzz" ) # Note that NA values in the vector x do not get special treatment. If you want # to explicitly handle NA values you can use the `is.na` function: x[2:4] <- NA_real_ case_when( x %% 35 == 0 ~ "fizz buzz", x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", is.na(x) ~ "nope", TRUE ~ as.character(x) ) # All RHS values need to be of the same type. Inconsistent types will throw an error. # This applies also to NA values used in RHS: NA is logical, use # typed values like NA_real_, NA_complex, NA_character_, NA_integer_ as appropriate. case_when( x %% 35 == 0 ~ NA_character_, x %% 5 == 0 ~ "fizz", x %% 7 == 0 ~ "buzz", TRUE ~ as.character(x) ) case_when( x %% 35 == 0 ~ 35, x %% 5 == 0 ~ 5, x %% 7 == 0 ~ 7, TRUE ~ NA_real_ ) # case_when() evaluates all RHS expressions, and then constructs its # result by extracting the selected (via the LHS expressions) parts. # In particular NaN are produced in this case: y <- seq(-2, 2, by = .5) case_when( y >= 0 ~ sqrt(y), TRUE ~ y ) ## Not run: case_when( x %% 35 == 0 ~ 35, x %% 5 == 0 ~ 5, x %% 7 == 0 ~ 7, TRUE ~ NA ) ## End(Not run) # case_when is particularly useful inside mutate when you want to # create a new variable that relies on a complex combination of existing # variables mtcars %>% mutate( efficient = case_when( mpg > 25 ~ TRUE, TRUE ~ FALSE ) )
Given a set of vectors, coalesce()
finds the first non-missing value at each position. This is inspired by the SQL
COALESCE
function which does the same thing for NULL
s.
coalesce(...)
coalesce(...)
... |
Vectors. Inputs should be recyclable (either be length |
Currently, coalesce()
type checking does not take place.
na_if()
to replace specified values with an NA
.
replace_na()
to replace an NA
with a value.
# Use a single value to replace all missing vectors x <- sample(c(1:5, NA, NA, NA)) coalesce(x, 0L) # Or match together a complete vector from missing pieces y <- c(1, 2, NA, NA, 5) z <- c(NA, NA, 3, 4, 5) coalesce(y, z)
# Use a single value to replace all missing vectors x <- sample(c(1:5, NA, NA, NA)) coalesce(x, 0L) # Or match together a complete vector from missing pieces y <- c(1, 2, NA, NA, 5) z <- c(NA, NA, 3, 4, 5) coalesce(y, z)
These functions return information about the "current" group or "current" variable, so only work inside specific
contexts like summarise()
and mutate()
.
n()
gives the number of observations in the current group.
cur_data()
gives the current data for the current group (excluding grouping variables).
cur_data_all()
gives the current data for the current group (including grouping variables).
cur_group()
gives the group keys, a single row data.frame
containing a column for each grouping variable and
its value.
cur_group_id()
gives a unique numeric identifier for the current group.
cur_group_rows()
gives the rows in which the current group appears in the data.
cur_column()
gives the name of the current column (in across()
only).
n() cur_data() cur_data_all() cur_group() cur_group_id() cur_group_rows() cur_column()
n() cur_data() cur_data_all() cur_group() cur_group_id() cur_group_rows() cur_column()
data.table
If you're familiar with data.table
:
cur_data()
<-> .SD
cur_group_id()
<-> .GRP
cur_group()
<-> .BY
cur_group_rows()
<-> .I
See group_data()
for equivalent functions that return values for all groups.
df <- data.frame( g = sample(rep(letters[1:3], 1:3)), x = runif(6), y = runif(6), stringsAsFactors = FALSE ) gf <- df %>% group_by(g) gf %>% summarise(n = n()) gf %>% mutate(id = cur_group_id()) gf %>% summarise(row = cur_group_rows()) gf %>% summarise(data = list(cur_group())) gf %>% summarise(data = list(cur_data())) gf %>% summarise(data = list(cur_data_all())) gf %>% mutate(across(everything(), ~ paste(cur_column(), round(.x, 2))))
df <- data.frame( g = sample(rep(letters[1:3], 1:3)), x = runif(6), y = runif(6), stringsAsFactors = FALSE ) gf <- df %>% group_by(g) gf %>% summarise(n = n()) gf %>% mutate(id = cur_group_id()) gf %>% summarise(row = cur_group_rows()) gf %>% summarise(data = list(cur_group())) gf %>% summarise(data = list(cur_data())) gf %>% summarise(data = list(cur_data_all())) gf %>% mutate(across(everything(), ~ paste(cur_column(), round(.x, 2))))
count()
lets you quickly count the unique values of one or more variables:
df %>% count(a, b)
is roughly equivalent to
df %>% group_by(a, b) %>% summarise(n = n())
.
count()
is paired with tally()
, a lower-level helper that is equivalent to df %>% summarise(n = n())
. Supply
wt
to perform weighted counts, switching the summary from n = n()
to n = sum(wt)
.
add_count()
and add_tally()
are equivalent to count()
and tally()
but use mutate()
instead of summarise()
so that they add a new column with group-wise counts.
count(x, ..., wt = NULL, sort = FALSE, name = NULL) tally(x, wt = NULL, sort = FALSE, name = NULL) add_count(x, ..., wt = NULL, sort = FALSE, name = NULL) add_tally(x, wt = NULL, sort = FALSE, name = NULL)
count(x, ..., wt = NULL, sort = FALSE, name = NULL) tally(x, wt = NULL, sort = FALSE, name = NULL) add_count(x, ..., wt = NULL, sort = FALSE, name = NULL) add_tally(x, wt = NULL, sort = FALSE, name = NULL)
x |
A |
... |
Variables to group by. |
wt |
If omitted, will count the number of rows. If specified, will perform a "weighted" count by summing the
(non-missing) values of variable |
sort |
|
name |
|
A data.frame
. count()
and add_count()
have the same groups as the input.
# count() is a convenient way to get a sense of the distribution of # values in a dataset mtcars %>% count(cyl) mtcars %>% count(cyl, sort = TRUE) mtcars %>% count(cyl, am, sort = TRUE) # Note that if the data are already grouped, count() adds an additional grouping variable # which is removed afterwards mtcars %>% group_by(gear) %>% count(cyl) # tally() is a lower-level function that assumes you've done the grouping mtcars %>% tally() mtcars %>% group_by(cyl) %>% tally() # both count() and tally() have add_ variants that work like mutate() instead of summarise mtcars %>% add_count(cyl, wt = am) mtcars %>% add_tally(wt = am)
# count() is a convenient way to get a sense of the distribution of # values in a dataset mtcars %>% count(cyl) mtcars %>% count(cyl, sort = TRUE) mtcars %>% count(cyl, am, sort = TRUE) # Note that if the data are already grouped, count() adds an additional grouping variable # which is removed afterwards mtcars %>% group_by(gear) %>% count(cyl) # tally() is a lower-level function that assumes you've done the grouping mtcars %>% tally() mtcars %>% group_by(cyl) %>% tally() # both count() and tally() have add_ variants that work like mutate() instead of summarise mtcars %>% add_count(cyl, wt = am) mtcars %>% add_tally(wt = am)
poorman provides cumall()
, cumany()
, and cummean()
to complete R's set of cumulative functions.
cummean(x) cumany(x) cumall(x)
cummean(x) cumany(x) cumall(x)
x |
For |
A vector the same length as x
.
These are particularly useful in conjunction with filter()
:
cumall(x)
: all cases until the first FALSE
.
cumall(!x)
: all cases until the first TRUE
.
cumany(x)
: all cases after the first TRUE
.
cumany(!x)
: all cases after the first FALSE
.
# `cummean()` returns a numeric/integer vector of the same length # as the input vector. x <- c(1, 3, 5, 2, 2) cummean(x) cumsum(x) / seq_along(x) # `cumall()` and `cumany()` return logicals cumall(x < 5) cumany(x == 3) # `cumall()` vs. `cumany()` df <- data.frame( date = as.Date("2020-01-01") + 0:6, balance = c(100, 50, 25, -25, -50, 30, 120) ) # all rows after first overdraft df %>% filter(cumany(balance < 0)) # all rows until first overdraft df %>% filter(cumall(!(balance < 0)))
# `cummean()` returns a numeric/integer vector of the same length # as the input vector. x <- c(1, 3, 5, 2, 2) cummean(x) cumsum(x) / seq_along(x) # `cumall()` and `cumany()` return logicals cumall(x < 5) cumany(x == 3) # `cumall()` vs. `cumany()` df <- data.frame( date = as.Date("2020-01-01") + 0:6, balance = c(100, 50, 25, -25, -50, 30, 120) ) # all rows after first overdraft df %>% filter(cumany(balance < 0)) # all rows until first overdraft df %>% filter(cumall(!(balance < 0)))
Transform a vector into a format that will be sorted in descending order. This is useful within arrange()
.
desc(x)
desc(x)
x |
A vector to transform. |
A vector of the same length as x
.
desc(1:10) desc(factor(letters)) first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years") desc(first_day) mtcars %>% arrange(desc(mpg))
desc(1:10) desc(factor(letters)) first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"), "years") desc(first_day) mtcars %>% arrange(desc(mpg))
Select only distinct/unique rows from a data.frame
.
distinct(.data, ..., .keep_all = FALSE)
distinct(.data, ..., .keep_all = FALSE)
.data |
A |
... |
Optional variables to use when determining uniqueness. If there are multiple rows for a given combination of inputs, only the first row will be preserved. If omitted, will use all variables. |
.keep_all |
|
A data.frame
with the following properties:
Rows are a subset of the input but appear in the same order.
Columns are not modified if ...
is empty or .keep_all
is TRUE
. Otherwise, distinct()
first calls mutate()
to create new columns.
Groups are not modified.
data.frame
attributes are preserved.
df <- data.frame( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) nrow(df) nrow(distinct(df)) nrow(distinct(df, x, y)) distinct(df, x) distinct(df, y) # You can choose to keep all other variables as well distinct(df, x, .keep_all = TRUE) distinct(df, y, .keep_all = TRUE) # You can also use distinct on computed variables distinct(df, diff = abs(x - y)) # The same behaviour applies for grouped data frames, # except that the grouping variables are always included df <- data.frame( g = c(1, 1, 2, 2), x = c(1, 1, 2, 1) ) %>% group_by(g) df %>% distinct(x)
df <- data.frame( x = sample(10, 100, rep = TRUE), y = sample(10, 100, rep = TRUE) ) nrow(df) nrow(distinct(df)) nrow(distinct(df, x, y)) distinct(df, x) distinct(df, y) # You can choose to keep all other variables as well distinct(df, x, .keep_all = TRUE) distinct(df, y, .keep_all = TRUE) # You can also use distinct on computed variables distinct(df, diff = abs(x - y)) # The same behaviour applies for grouped data frames, # except that the grouping variables are always included df <- data.frame( g = c(1, 1, 2, 2), x = c(1, 1, 2, 1) ) %>% group_by(g) df %>% distinct(x)
Fills missing values in selected columns using the next or previous entry. This is useful in the common output format where values are not repeated, and are only recorded when they change.
fill(data, ..., .direction = c("down", "up", "downup", "updown"))
fill(data, ..., .direction = c("down", "up", "downup", "updown"))
data |
A |
... |
Columns to fill. |
.direction |
Direction in which to fill missing values. Currently either |
Missing values are replaced in atomic vectors; NULL
s are replaced in lists.
# Value (year) is recorded only when it changes sales <- data.frame( quarter = c( "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4" ), year = c(2000, NA, NA, NA, 2001, NA, NA, NA, 2002, NA, NA, NA, 2004, NA, NA, NA), sales = c( 66013, 69182, 53175, 21001, 46036, 58842, 44568, 50197, 39113, 41668, 30144, 52897, 32129, 67686, 31768, 49094 ) ) # `fill()` defaults to replacing missing data from top to bottom sales %>% fill(year) # Value (pet_type) is missing above tidy_pets <- data.frame( rank = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), pet_type = c(NA, NA, NA, NA, NA, "Dog", NA, NA, NA, NA, NA, "Cat"), breed = c( "Boston Terrier", "Retrievers (Labrador)", "Retrievers (Golden)", "French Bulldogs", "Bulldogs", "Beagles", "Persian", "Maine Coon", "Ragdoll", "Exotic", "Siamese", "American Short" ) ) # For values that are missing above you can use `.direction = "up"` tidy_pets %>% fill(pet_type, .direction = "up") # Value (n_squirrels) is missing above and below within a group squirrels <- data.frame( group = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3), name = c( "Sam", "Mara", "Jesse", "Tom", "Mike", "Rachael", "Sydekea", "Gabriela", "Derrick", "Kara", "Emily", "Danielle" ), role = c( "Observer", "Scorekeeper", "Observer", "Observer", "Observer", "Observer", "Scorekeeper", "Observer", "Observer", "Scorekeeper", "Observer", "Observer" ), n_squirrels = c(NA, 8, NA, NA, NA, NA, 14, NA, NA, 9, NA, NA) ) # The values are inconsistently missing by position within the group # Use .direction = "downup" to fill missing values in both directions squirrels %>% group_by(group) %>% fill(n_squirrels, .direction = "downup") %>% ungroup() # Using `.direction = "updown"` accomplishes the same goal in this example
# Value (year) is recorded only when it changes sales <- data.frame( quarter = c( "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4" ), year = c(2000, NA, NA, NA, 2001, NA, NA, NA, 2002, NA, NA, NA, 2004, NA, NA, NA), sales = c( 66013, 69182, 53175, 21001, 46036, 58842, 44568, 50197, 39113, 41668, 30144, 52897, 32129, 67686, 31768, 49094 ) ) # `fill()` defaults to replacing missing data from top to bottom sales %>% fill(year) # Value (pet_type) is missing above tidy_pets <- data.frame( rank = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L), pet_type = c(NA, NA, NA, NA, NA, "Dog", NA, NA, NA, NA, NA, "Cat"), breed = c( "Boston Terrier", "Retrievers (Labrador)", "Retrievers (Golden)", "French Bulldogs", "Bulldogs", "Beagles", "Persian", "Maine Coon", "Ragdoll", "Exotic", "Siamese", "American Short" ) ) # For values that are missing above you can use `.direction = "up"` tidy_pets %>% fill(pet_type, .direction = "up") # Value (n_squirrels) is missing above and below within a group squirrels <- data.frame( group = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3), name = c( "Sam", "Mara", "Jesse", "Tom", "Mike", "Rachael", "Sydekea", "Gabriela", "Derrick", "Kara", "Emily", "Danielle" ), role = c( "Observer", "Scorekeeper", "Observer", "Observer", "Observer", "Observer", "Scorekeeper", "Observer", "Observer", "Scorekeeper", "Observer", "Observer" ), n_squirrels = c(NA, 8, NA, NA, NA, NA, 14, NA, NA, 9, NA, NA) ) # The values are inconsistently missing by position within the group # Use .direction = "downup" to fill missing values in both directions squirrels %>% group_by(group) %>% fill(n_squirrels, .direction = "downup") %>% ungroup() # Using `.direction = "updown"` accomplishes the same goal in this example
Use filter()
to choose rows/cases where conditions are TRUE
.
filter(.data, ..., .preserve = FALSE)
filter(.data, ..., .preserve = FALSE)
.data |
A |
... |
Logical predicates defined in terms of the variables in |
.preserve |
|
A data.frame
.
==
, >
, >=
, etc.
&
, |
, !
, xor()
is.na()
filter(mtcars, am == 1) mtcars %>% filter(cyl == 4) mtcars %>% filter(cyl <= 5 & am > 0) mtcars %>% filter(cyl == 4 | cyl == 8) mtcars %>% filter(!(cyl %in% c(4, 6)), am != 0)
filter(mtcars, am == 1) mtcars %>% filter(cyl == 4) mtcars %>% filter(cyl <= 5 & am > 0) mtcars %>% filter(cyl == 4 | cyl == 8) mtcars %>% filter(!(cyl %in% c(4, 6)), am != 0)
x
based on the presence or absence of
matches in y
:
semi_join()
returns all rows from x
with a match in y
.
anti_join()
returns all rows from x
without a match in y
.
anti_join(x, y, by = NULL) semi_join(x, y, by = NULL)
anti_join(x, y, by = NULL) semi_join(x, y, by = NULL)
x , y
|
The |
by |
A character vector of variables to join by. If |
table1 <- data.frame( pupil = rep(1:3, each = 2), test = rep(c("A", "B"), 3), score = c(60, 70, 65, 80, 85, 70), stringsAsFactors = FALSE ) table2 <- table1[c(1, 3, 4), ] table1 %>% anti_join(table2, by = c("pupil", "test")) table1 %>% semi_join(table2, by = c("pupil", "test"))
table1 <- data.frame( pupil = rep(1:3, each = 2), test = rep(c("A", "B"), 3), score = c(60, 70, 65, 80, 85, 70), stringsAsFactors = FALSE ) table2 <- table1[c(1, 3, 4), ] table1 %>% anti_join(table2, by = c("pupil", "test")) table1 %>% semi_join(table2, by = c("pupil", "test"))
glimpse()
is like a transposed version of print(): columns run down the page, and data runs across. This makes it
possible to see every column in a data.frame
. It is no more than a wrapper around utils::str()
except that it returns
the input (invisibly), meaning it can be used within a data pipeline.
glimpse(x, width = getOption("width"), ...)
glimpse(x, width = getOption("width"), ...)
x |
An object to glimpse at. |
width |
|
... |
Additional parameters to pass to |
x
, invisibly.
glimpse(mtcars)
glimpse(mtcars)
Determine the groups within a data.frame
to perform operations on. ungroup()
removes the grouping levels.
group_by(.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) ungroup(x, ...)
group_by(.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) ungroup(x, ...)
.data |
|
... |
One or more unquoted column names to group/ungroup the data by. |
.add |
|
.drop |
|
x |
A |
When using group_by()
, a data.frame
, grouped by the grouping variables.
When using ungroup()
, a data.frame
.
group_by(mtcars, am, cyl) ungroup(mutate(group_by(mtcars, am, cyl), sumMpg = sum(mpg))) mtcars %>% group_by(am, cyl) %>% mutate(sumMpg = sum(mpg)) %>% ungroup() mtcars %>% group_by(carb) %>% filter(any(gear == 5)) # You can group by expressions: this is just short-hand for # a mutate() followed by a group_by() mtcars %>% group_by(vsam = vs + am)
group_by(mtcars, am, cyl) ungroup(mutate(group_by(mtcars, am, cyl), sumMpg = sum(mpg))) mtcars %>% group_by(am, cyl) %>% mutate(sumMpg = sum(mpg)) %>% ungroup() mtcars %>% group_by(carb) %>% filter(any(gear == 5)) # You can group by expressions: this is just short-hand for # a mutate() followed by a group_by() mtcars %>% group_by(vsam = vs + am)
Default value for .drop argument of group_by
group_by_drop_default(.tbl)
group_by_drop_default(.tbl)
.tbl |
A |
TRUE
unless .tbl
is a grouped data.frame
that was previously obtained by group_by(.drop = FALSE)
group_by_drop_default(iris) iris %>% group_by(Species) %>% group_by_drop_default() iris %>% group_by(Species, .drop = FALSE) %>% group_by_drop_default()
group_by_drop_default(iris) iris %>% group_by(Species) %>% group_by_drop_default() iris %>% group_by(Species, .drop = FALSE) %>% group_by_drop_default()
This selection helper matches grouping variables. It can be used within select()
and relocate()
selections.
group_cols()
group_cols()
groups()
and group_vars()
for retrieving the grouping variables outside selection contexts.
mtcars %>% group_by(am, cyl) %>% select(group_cols())
mtcars %>% group_by(am, cyl) %>% select(group_cols())
group_data()
returns a data frame that defines the grouping structure. The columns give the values of the
grouping variables. The last column, always called .rows
, is a list of integer vectors that gives the location of
the rows in each group.
group_rows()
returns the rows which each group contains.
group_indices()
returns an integer vector the same length as .data
that gives the group that each row belongs
to.
group_vars()
gives the names of the grouping variables as a character vector.
groups()
gives the names as a list of symbols.
group_size()
gives the size of each group.
n_groups()
gives the total number of groups.
group_data(.data) group_rows(.data) group_indices(.data) group_vars(x) groups(x) group_size(x) n_groups(x)
group_data(.data) group_rows(.data) group_indices(.data) group_vars(x) groups(x) group_size(x) n_groups(x)
.data , x
|
A |
See context for equivalent functions that return values for the current group.
df <- data.frame(x = c(1,1,2,2)) group_vars(df) group_rows(df) group_data(df) gf <- group_by(df, x) group_vars(gf) group_rows(gf) group_data(gf)
df <- data.frame(x = c(1,1,2,2)) group_vars(df) group_rows(df) group_data(df) gf <- group_by(df, x) group_vars(gf) group_rows(gf) group_data(gf)
group_split()
works like base::split()
but
it uses the grouping structure from group_by()
and is therefore subject to the data mask
it does not name the elements of the list based on the grouping as this typically loses information and is confusing
group_split(.data, ..., .keep = TRUE) group_keys(.data)
group_split(.data, ..., .keep = TRUE) group_keys(.data)
.data |
A |
... |
Grouping specification, forwarded to |
.keep |
|
Grouped data.frame
s:
The primary use case for group_split()
is with already grouped data.frame
s, typically a result of group_by()
.
In this case, group_split()
only uses the first argument, the grouped data.frame
, and warns when ...
is used.
Because some of these groups may be empty, it is best paired with group_keys()
which identifies the representatives
of each grouping variable for the group.
Ungrouped data.frame
s:
When used on ungrouped data.frame
s, group_split()
forwards the ...
to group_by()
before the split, therefore
the ...
are subject to the data mask.
group_split()
returns a list of data.frame
s. Each data.frame
contains the rows of .data
with the associated
group and all the columns, including the grouping variables.
group_keys()
returns a data.frame
with one row per group, and one column per grouping variable.
# Grouped data.frames: mtcars %>% group_by(cyl, am) %>% group_split() mtcars %>% group_by(cyl, am) %>% group_split(.keep = FALSE) mtcars %>% group_by(cyl, am) %>% group_keys() # Ungrouped data.frames: mtcars %>% group_split(am, cyl)
# Grouped data.frames: mtcars %>% group_by(cyl, am) %>% group_split() mtcars %>% group_by(cyl, am) %>% group_split(.keep = FALSE) mtcars %>% group_by(cyl, am) %>% group_keys() # Ungrouped data.frames: mtcars %>% group_split(am, cyl)
This is a wrapper around ifelse()
which checks that true
and false
are of the same type, making the output more
predictable.
if_else(condition, true, false, missing = NULL)
if_else(condition, true, false, missing = NULL)
condition |
A |
true , false
|
Values to use for |
missing |
If not |
A vector the same length as condition
with values for TRUE
and FALSE
replaced by those specified in
true
and false
, respectively.
x <- c(-5:5, NA) if_else(x < 0, NA_integer_, x) if_else(x < 0, "negative", "positive", "missing") # Unlike ifelse, if_else preserves types x <- factor(sample(letters[1:5], 10, replace = TRUE)) ifelse(x %in% c("a", "b", "c"), x, factor(NA)) # Attributes are taken from the `true` vector if_else(x %in% c("a", "b", "c"), x, factor(NA))
x <- c(-5:5, NA) if_else(x < 0, NA_integer_, x) if_else(x < 0, "negative", "positive", "missing") # Unlike ifelse, if_else preserves types x <- factor(sample(letters[1:5], 10, replace = TRUE)) ifelse(x %in% c("a", "b", "c"), x, factor(NA)) # Attributes are taken from the `true` vector if_else(x %in% c("a", "b", "c"), x, factor(NA))
Find the "previous" (lag()
) or "next" (lead()
) values in a vector. Useful for comparing values behind or ahead
of the current values.
lag(x, n = 1L, default = NA) lead(x, n = 1L, default = NA)
lag(x, n = 1L, default = NA) lead(x, n = 1L, default = NA)
x |
A |
n |
A positive |
default |
The value used for non-existent rows (default: NA). |
lag(1:5) lead(1:5) x <- 1:5 data.frame(behind = lag(x), x, ahead = lead(x)) # If you want to look more rows behind or ahead, use `n` lag(1:5, n = 1) lag(1:5, n = 2) lead(1:5, n = 1) lead(1:5, n = 2) # If you want to define a value for non-existing rows, use `default` lag(1:5) lag(1:5, default = 0) lead(1:5) lead(1:5, default = 6)
lag(1:5) lead(1:5) x <- 1:5 data.frame(behind = lag(x), x, ahead = lead(x)) # If you want to look more rows behind or ahead, use `n` lag(1:5, n = 1) lag(1:5, n = 2) lead(1:5, n = 1) lead(1:5, n = 2) # If you want to define a value for non-existing rows, use `default` lag(1:5) lag(1:5, default = 0) lead(1:5) lead(1:5, default = 6)
lst()
constructs a list, similar to base::list()
, but where components
are built sequentially. When defining a component, you can refer to components
created earlier in the call. lst()
also generates missing names
automatically.
lst(...)
lst(...)
... |
Named or unnamed elements of a list. If the element is unnamed, its expression will be used as its name. |
A named list.
# the value of n can be used immediately in the definition of x lst(n = 5, x = runif(n)) # missing names are constructed from user's input lst(1:3, z = letters[4:6], runif(3)) a <- 1:3 b <- letters[4:6] lst(a, b)
# the value of n can be used immediately in the definition of x lst(n = 5, x = runif(n)) # missing names are constructed from user's input lst(1:3, z = letters[4:6], runif(3)) a <- 1:3 b <- letters[4:6] lst(a, b)
mutate()
adds new variables and preserves existing ones; transmute()
adds new variables and drops existing ones.
Both functions preserve the number of rows of the input. New variables overwrite existing variables of the same name.
Variables can be removed by setting their value to NULL
.
mutate(.data, ...) ## S3 method for class 'data.frame' mutate( .data, ..., .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL ) transmute(.data, ...)
mutate(.data, ...) ## S3 method for class 'data.frame' mutate( .data, ..., .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL ) transmute(.data, ...)
.data |
A |
... |
Name-value pairs of expressions, each with length |
.keep |
This argument allows you to control which columns from
Grouping variables are always kept, unconditional to |
.before , .after
|
< |
mutate(mtcars, mpg2 = mpg * 2) mtcars %>% mutate(mpg2 = mpg * 2) mtcars %>% mutate(mpg2 = mpg * 2, cyl2 = cyl * 2) # Newly created variables are available immediately mtcars %>% mutate(mpg2 = mpg * 2, mpg4 = mpg2 * 2) # You can also use mutate() to remove variables and modify existing variables mtcars %>% mutate( mpg = NULL, disp = disp * 0.0163871 # convert to litres ) # By default, new columns are placed on the far right. # You can override this with `.before` or `.after`. df <- data.frame(x = 1, y = 2) df %>% mutate(z = x + y) df %>% mutate(z = x + y, .before = 1) df %>% mutate(z = x + y, .after = x) # By default, mutate() keeps all columns from the input data. # You can override with `.keep` df <- data.frame( x = 1, y = 2, a = "a", b = "b", stringsAsFactors = FALSE ) df %>% mutate(z = x + y, .keep = "all") # the default df %>% mutate(z = x + y, .keep = "used") df %>% mutate(z = x + y, .keep = "unused") df %>% mutate(z = x + y, .keep = "none") # same as transmute() # mutate() vs transmute -------------------------- # mutate() keeps all existing variables mtcars %>% mutate(displ_l = disp / 61.0237) # transmute keeps only the variables you create mtcars %>% transmute(displ_l = disp / 61.0237)
mutate(mtcars, mpg2 = mpg * 2) mtcars %>% mutate(mpg2 = mpg * 2) mtcars %>% mutate(mpg2 = mpg * 2, cyl2 = cyl * 2) # Newly created variables are available immediately mtcars %>% mutate(mpg2 = mpg * 2, mpg4 = mpg2 * 2) # You can also use mutate() to remove variables and modify existing variables mtcars %>% mutate( mpg = NULL, disp = disp * 0.0163871 # convert to litres ) # By default, new columns are placed on the far right. # You can override this with `.before` or `.after`. df <- data.frame(x = 1, y = 2) df %>% mutate(z = x + y) df %>% mutate(z = x + y, .before = 1) df %>% mutate(z = x + y, .after = x) # By default, mutate() keeps all columns from the input data. # You can override with `.keep` df <- data.frame( x = 1, y = 2, a = "a", b = "b", stringsAsFactors = FALSE ) df %>% mutate(z = x + y, .keep = "all") # the default df %>% mutate(z = x + y, .keep = "used") df %>% mutate(z = x + y, .keep = "unused") df %>% mutate(z = x + y, .keep = "none") # same as transmute() # mutate() vs transmute -------------------------- # mutate() keeps all existing variables mtcars %>% mutate(displ_l = disp / 61.0237) # transmute keeps only the variables you create mtcars %>% transmute(displ_l = disp / 61.0237)
The mutating joins add columns from y
to x
, matching rows based on the keys:
inner_join()
: includes all rows in x
and y
.
left_join()
: includes all rows in x
.
right_join()
: includes all rows in y
.
full_join()
: includes all rows in x
or y
.
If a row in x
matches multiple rows in y
, all the rows in y
will be returned once for each matching row in x
.
inner_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., na_matches = c("na", "never") ) left_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., keep = FALSE, na_matches = c("na", "never") ) right_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., keep = FALSE, na_matches = c("na", "never") ) full_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., keep = FALSE, na_matches = c("na", "never") )
inner_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., na_matches = c("na", "never") ) left_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., keep = FALSE, na_matches = c("na", "never") ) right_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., keep = FALSE, na_matches = c("na", "never") ) full_join( x, y, by = NULL, suffix = c(".x", ".y"), ..., keep = FALSE, na_matches = c("na", "never") )
x , y
|
The |
by |
A character vector of variables to join by. If To join by different variables on x and y use a named vector. For example, To join by multiple variables, use a vector with length > 1. For example, To perform a cross-join, generating all combinations of |
suffix |
|
... |
Additional arguments to pass to |
na_matches |
Should The default, Use |
keep |
|
A data.frame
. The order of the rows and columns of x
is preserved as much as possible. The output has the
following properties:
For inner_join()
, a subset of x
rows.
For left_join()
, all x
rows.
For right_join()
, a subset of x
rows, followed by unmatched y
rows.
For full_join()
, all x
rows, followed by unmatched y
rows.
For all joins, rows will be duplicated if one or more rows in x
matches multiple rows in y
.
Output columns include all x
columns and all y
columns. If columns in x
and y
have the same name (and
aren't included in by
), suffix
es are added to disambiguate.
Output columns included in by
are coerced to common type across x
and y
.
Groups are taken from x
.
# If a row in `x` matches multiple rows in `y`, all the rows in `y` will be # returned once for each matching row in `x` df1 <- data.frame(x = 1:3) df2 <- data.frame(x = c(1, 1, 2), y = c("first", "second", "third")) df1 %>% left_join(df2) # By default, NAs match other NAs so that there are two # rows in the output of this join: df1 <- data.frame(x = c(1, NA), y = 2) df2 <- data.frame(x = c(1, NA), z = 3) left_join(df1, df2) # You can optionally request that NAs don't match, giving a # a result that more closely resembles SQL joins left_join(df1, df2, na_matches = "never")
# If a row in `x` matches multiple rows in `y`, all the rows in `y` will be # returned once for each matching row in `x` df1 <- data.frame(x = 1:3) df2 <- data.frame(x = c(1, 1, 2), y = c("first", "second", "third")) df1 %>% left_join(df2) # By default, NAs match other NAs so that there are two # rows in the output of this join: df1 <- data.frame(x = c(1, NA), y = 2) df2 <- data.frame(x = c(1, NA), z = 3) left_join(df1, df2) # You can optionally request that NAs don't match, giving a # a result that more closely resembles SQL joins left_join(df1, df2, na_matches = "never")
This is the equivalent of length(unique(x))
for multiple vectors.
n_distinct(..., na.rm = FALSE)
n_distinct(..., na.rm = FALSE)
... |
Vectors of values. |
na.rm |
|
x <- sample(1:10, 1e5, rep = TRUE) length(unique(x)) n_distinct(x)
x <- sample(1:10, 1e5, rep = TRUE) length(unique(x)) n_distinct(x)
This is a translation of the SQL command NULLIF
. It is useful if you want to convert an annoying value to NA
.
na_if(x, y)
na_if(x, y)
x |
The vector to modify. |
y |
The value to replace with NA. |
A modified version of x
that replaces any values that are equal to y
with NA
.
coalesce()
to replace missing values within subsequent vector(s) of value(s).
replace_na()
to replace NA
with a value.
recode()
to more generally replace values.
na_if(1:5, 5:1) x <- c(1, -1, 0, 10) 100 / x 100 / na_if(x, 0) y <- c("abc", "def", "", "ghi") na_if(y, "") # na_if() is particularly useful inside mutate(), # and is meant for use with vectors rather than entire data.frames mtcars %>% mutate(cyl = na_if(cyl, 6))
na_if(1:5, 5:1) x <- c(1, -1, 0, 10) 100 / x 100 / na_if(x, 0) y <- c("abc", "def", "", "ghi") na_if(y, "") # na_if() is particularly useful inside mutate(), # and is meant for use with vectors rather than entire data.frames mtcars %>% mutate(cyl = na_if(cyl, 6))
This is a safe way of comparing if two vectors of floating point numbers are (pairwise) equal. This is safer than
using ==
, because it has a built in tolerance.
near(x, y, tol = .Machine$double.eps^0.5)
near(x, y, tol = .Machine$double.eps^0.5)
x , y
|
Numeric vectors to compare |
tol |
Tolerance of comparison. |
sqrt(2) ^ 2 == 2 near(sqrt(2) ^ 2, 2)
sqrt(2) ^ 2 == 2 near(sqrt(2) ^ 2, 2)
nest_by()
is similar to group_by()
however instead of storing the group structure in the metadata, it is made
explicit in the data. Each group key is given a single row within the data.frame
and the group's data is stored
within a list-column of the data.frame
.
nest_by(.data, ..., .key = "data", .keep = FALSE)
nest_by(.data, ..., .key = "data", .keep = FALSE)
.data |
A |
... |
Grouping specification, forwarded to |
.key |
|
.keep |
|
Currently there is no pretty-printing provided for the results of nest_by()
and they are not useable with other
functions such as mutate()
.
mtcars %>% nest_by(am, cyl) # Or equivalently mtcars %>% group_by(am, cyl) %>% nest_by()
mtcars %>% nest_by(am, cyl) # Or equivalently mtcars %>% group_by(am, cyl) %>% nest_by()
These are straightforward wrappers around [[
. The main advantage is that you can provide an optional
secondary vector that defines the ordering, and provide a default value to use when the input is shorter than
expected.
nth(x, n, order_by = NULL, default = default_missing(x)) first(x, order_by = NULL, default = default_missing(x)) last(x, order_by = NULL, default = default_missing(x))
nth(x, n, order_by = NULL, default = default_missing(x)) first(x, order_by = NULL, default = default_missing(x)) last(x, order_by = NULL, default = default_missing(x))
x |
A vector |
n |
For If a double is supplied, it will be silently truncated. |
order_by |
An optional vector used to determine the order |
default |
A default value to use if the position does not exist in the input. This is guessed by default for
base vectors, where a missing value of the appropriate type is returned, and for lists, where a NULL is returned. For more complicated objects, you'll need to supply this value. Make sure it is the same type as x. |
A single value. [[
is used to do the subsetting.
x <- 1:10 y <- 10:1 first(x) last(y) nth(x, 1) nth(x, 5) nth(x, -2) nth(x, 11) last(x) # Second argument provides optional ordering last(x, y) # These functions always return a single value first(integer())
x <- 1:10 y <- 10:1 first(x) last(y) nth(x, 1) nth(x, 5) nth(x, -2) nth(x, 11) last(x) # Second argument provides optional ordering last(x, y) # These functions always return a single value first(integer())
Return the vector of column names of the data currently available for selection.
peek_vars()
peek_vars()
A vector of column names.
Pipe an object forward into a function or call expression.
lhs %>% rhs
lhs %>% rhs
lhs |
The result you are piping. |
rhs |
Where you are piping the result to. |
Nathan Eastwood and Antoine Fabri [email protected].
# Basic use: iris %>% head # Use with lhs as first argument iris %>% head(10) # Using the dot place-holder "Ceci n'est pas une pipe" %>% gsub("une", "un", .) # When dot is nested, lhs is still placed first: sample(1:10) %>% paste0(LETTERS[.]) # This can be avoided: rnorm(100) %>% {c(min(.), mean(.), max(.))} %>% floor # Lambda expressions: iris %>% { size <- sample(1:10, size = 1) rbind(head(., size), tail(., size)) } # renaming in lambdas: iris %>% { my_data <- . size <- sample(1:10, size = 1) rbind(head(my_data, size), tail(my_data, size)) }
# Basic use: iris %>% head # Use with lhs as first argument iris %>% head(10) # Using the dot place-holder "Ceci n'est pas une pipe" %>% gsub("une", "un", .) # When dot is nested, lhs is still placed first: sample(1:10) %>% paste0(LETTERS[.]) # This can be avoided: rnorm(100) %>% {c(min(.), mean(.), max(.))} %>% floor # Lambda expressions: iris %>% { size <- sample(1:10, size = 1) rbind(head(., size), tail(., size)) } # renaming in lambdas: iris %>% { my_data <- . size <- sample(1:10, size = 1) rbind(head(my_data, size), tail(my_data, size)) }
pivot_longer()
"lengthens" data, increasing the number of rows and decreasing the number of columns. The inverse
transformation is pivot_wider()
.
pivot_longer( data, cols, names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, values_to = "value", values_drop_na = FALSE, ... )
pivot_longer( data, cols, names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, values_to = "value", values_drop_na = FALSE, ... )
data |
|
cols |
< |
names_to |
|
names_prefix |
|
names_sep , names_pattern
|
|
values_to |
|
values_drop_na |
|
... |
Additional arguments passed on to methods. |
A data.frame
.
wide_data <- data.frame(replicate(5, rnorm(10))) # Customizing the names pivot_longer( data = wide_data, cols = c(1, 2), names_to = "Column", values_to = "Numbers" )
wide_data <- data.frame(replicate(5, rnorm(10))) # Customizing the names pivot_longer( data = wide_data, cols = c(1, 2), names_to = "Column", values_to = "Numbers" )
pivot_wider()
"widens" data, increasing the number of columns and decreasing the number of rows. The inverse
transformation is pivot_longer()
.
pivot_wider( data, id_cols = NULL, values_from = "Value", names_from = "Name", names_sep = "_", names_prefix = "", names_glue = NULL, values_fill = NULL, ... )
pivot_wider( data, id_cols = NULL, values_from = "Value", names_from = "Name", names_sep = "_", names_prefix = "", names_glue = NULL, values_fill = NULL, ... )
data |
|
id_cols |
|
values_from |
|
names_from |
|
names_sep |
|
names_prefix |
|
names_glue |
|
values_fill |
|
... |
Not used for now. |
If a tibble was provided as input, pivot_wider()
also returns a
tibble. Otherwise, it returns a data frame.
data_long <- read.table(header = TRUE, text = " subject sex condition measurement 1 M control 7.9 1 M cond1 12.3 1 M cond2 10.7 2 F control 6.3 2 F cond1 10.6 2 F cond2 11.1 3 F control 9.5 3 F cond1 13.1 3 F cond2 13.8 4 M control 11.5 4 M cond1 13.4 4 M cond2 12.9") pivot_wider( data_long, id_cols = "subject", names_from = "condition", values_from = "measurement" ) pivot_wider( data_long, id_cols = "subject", names_from = "condition", values_from = "measurement", names_prefix = "Var.", names_sep = "." ) production <- expand.grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) %>% filter((product == "A" & country == "AI") | product == "B") %>% mutate(production = rnorm(nrow(.))) pivot_wider( production, names_from = c("product", "country"), values_from = "production", names_glue = "prod_{product}_{country}" )
data_long <- read.table(header = TRUE, text = " subject sex condition measurement 1 M control 7.9 1 M cond1 12.3 1 M cond2 10.7 2 F control 6.3 2 F cond1 10.6 2 F cond2 11.1 3 F control 9.5 3 F cond1 13.1 3 F cond2 13.8 4 M control 11.5 4 M cond1 13.4 4 M cond2 12.9") pivot_wider( data_long, id_cols = "subject", names_from = "condition", values_from = "measurement" ) pivot_wider( data_long, id_cols = "subject", names_from = "condition", values_from = "measurement", names_prefix = "Var.", names_sep = "." ) production <- expand.grid( product = c("A", "B"), country = c("AI", "EI"), year = 2000:2014 ) %>% filter((product == "A" & country == "AI") | product == "B") %>% mutate(production = rnorm(nrow(.))) pivot_wider( production, names_from = c("product", "country"), values_from = "production", names_glue = "prod_{product}_{country}" )
This is a direct replacement for [[.data.frame
.
pull(.data, var = -1)
pull(.data, var = -1)
.data |
A |
var |
A variable specified as:
The default returns the last column (on the assumption that's the column you've created most recently). |
mtcars %>% pull(-1) mtcars %>% pull(1) mtcars %>% pull(cyl) mtcars %>% pull("cyl")
mtcars %>% pull(-1) mtcars %>% pull(1) mtcars %>% pull(cyl) mtcars %>% pull("cyl")
This is a vectorised version of switch()
: you can replace numeric
values based on their position or their name,
and character
or factor
values only by their name. This is an S3 generic: {poorman}
provides methods for
numeric
, character
, and factor
s. For logical
vectors, use if_else()
. For more complicated criteria, use
case_when()
.
You can use recode()
directly with factor
s; it will preserve the existing order of levels while changing the
values. Alternatively, you can use recode_factor()
, which will change the order of levels to match the order of
replacements.
This is a direct port of the dplyr::recode()
function.
recode(.x, ..., .default = NULL, .missing = NULL) recode_factor(.x, ..., .default = NULL, .missing = NULL, .ordered = FALSE)
recode(.x, ..., .default = NULL, .missing = NULL) recode_factor(.x, ..., .default = NULL, .missing = NULL, .ordered = FALSE)
.x |
A vector to modify |
... |
Replacements. For When named, the argument names should be the current values to be replaced, and the argument values should be the new (replacement) values. All replacements must be the same type, and must have either length one or the same length as |
.default |
If supplied, all values not otherwise matched will be given this value. If not supplied and if the
replacements are the same type as the original values in
|
.missing |
If supplied, any missing values in .x will be replaced by this value. |
.ordered |
|
A vector the same length as .x
, and the same type as
the first of ...
, .default
, or .missing
.
recode_factor()
returns a factor whose levels are in the same order as
in ...
. The levels in .default
and .missing
come last.
na_if()
to replace specified values with NA
.
coalesce()
to replace missing values with a specified value.
replace_na()
to replace NA
with a value.
# For character values, recode values with named arguments only. Unmatched # values are unchanged. char_vec <- sample(c("a", "b", "c"), 10, replace = TRUE) recode(char_vec, a = "Apple") recode(char_vec, a = "Apple", b = "Banana") # Use .default as replacement for unmatched values. Note that NA and # replacement values need to be of the same type. recode(char_vec, a = "Apple", b = "Banana", .default = NA_character_) # Throws an error as NA is logical, not character. ## Not run: recode(char_vec, a = "Apple", b = "Banana", .default = NA) ## End(Not run) # For numeric values, named arguments can also be used num_vec <- c(1:4, NA) recode(num_vec, `2` = 20L, `4` = 40L) # Or if you don't name the arguments, recode() matches by position. # (Only works for numeric vector) recode(num_vec, "a", "b", "c", "d") # .x (position given) looks in (...), then grabs (... value at position) # so if nothing at position (here 5), it uses .default or NA. recode(c(1, 5, 3), "a", "b", "c", "d", .default = "nothing") # Note that if the replacements are not compatible with .x, # unmatched values are replaced by NA and a warning is issued. recode(num_vec, `2` = "b", `4` = "d") # use .default to change the replacement value recode(num_vec, "a", "b", "c", .default = "other") # use .missing to replace missing values in .x recode(num_vec, "a", "b", "c", .default = "other", .missing = "missing") # For factor values, use only named replacements # and supply default with levels() factor_vec <- factor(c("a", "b", "c")) recode(factor_vec, a = "Apple", .default = levels(factor_vec)) # Use recode_factor() to create factors with levels ordered as they # appear in the recode call. The levels in .default and .missing # come last. 
recode_factor(num_vec, `1` = "z", `2` = "y", `3` = "x") recode_factor(num_vec, `1` = "z", `2` = "y", `3` = "x", .default = "D") recode_factor(num_vec, `1` = "z", `2` = "y", `3` = "x", .default = "D", .missing = "M") # When the input vector is a compatible vector (character vector or # factor), it is reused as default. recode_factor(letters[1:3], b = "z", c = "y") recode_factor(factor(letters[1:3]), b = "z", c = "y")
# For character values, recode values with named arguments only. Unmatched # values are unchanged. char_vec <- sample(c("a", "b", "c"), 10, replace = TRUE) recode(char_vec, a = "Apple") recode(char_vec, a = "Apple", b = "Banana") # Use .default as replacement for unmatched values. Note that NA and # replacement values need to be of the same type. recode(char_vec, a = "Apple", b = "Banana", .default = NA_character_) # Throws an error as NA is logical, not character. ## Not run: recode(char_vec, a = "Apple", b = "Banana", .default = NA) ## End(Not run) # For numeric values, named arguments can also be used num_vec <- c(1:4, NA) recode(num_vec, `2` = 20L, `4` = 40L) # Or if you don't name the arguments, recode() matches by position. # (Only works for numeric vector) recode(num_vec, "a", "b", "c", "d") # .x (position given) looks in (...), then grabs (... value at position) # so if nothing at position (here 5), it uses .default or NA. recode(c(1, 5, 3), "a", "b", "c", "d", .default = "nothing") # Note that if the replacements are not compatible with .x, # unmatched values are replaced by NA and a warning is issued. recode(num_vec, `2` = "b", `4` = "d") # use .default to change the replacement value recode(num_vec, "a", "b", "c", .default = "other") # use .missing to replace missing values in .x recode(num_vec, "a", "b", "c", .default = "other", .missing = "missing") # For factor values, use only named replacements # and supply default with levels() factor_vec <- factor(c("a", "b", "c")) recode(factor_vec, a = "Apple", .default = levels(factor_vec)) # Use recode_factor() to create factors with levels ordered as they # appear in the recode call. The levels in .default and .missing # come last. 
recode_factor(num_vec, `1` = "z", `2` = "y", `3` = "x") recode_factor(num_vec, `1` = "z", `2` = "y", `3` = "x", .default = "D") recode_factor(num_vec, `1` = "z", `2` = "y", `3` = "x", .default = "D", .missing = "M") # When the input vector is a compatible vector (character vector or # factor), it is reused as default. recode_factor(letters[1:3], b = "z", c = "y") recode_factor(factor(letters[1:3]), b = "z", c = "y")
Use relocate()
to change column positions, using the same syntax as select()
to make it easy to move blocks of
columns at once.
relocate(.data, ..., .before = NULL, .after = NULL)
relocate(.data, ..., .before = NULL, .after = NULL)
.data |
A |
... |
< |
.before , .after
|
< |
An object of the same type as .data
. The output has the following properties:
Rows are not affected.
The same columns appear in the output, but (usually) in a different place.
Data frame attributes are preserved.
Groups are not affected.
df <- data.frame( a = 1, b = 1, c = 1, d = "a", e = "a", f = "a", stringsAsFactors = FALSE ) df %>% relocate(f) df %>% relocate(a, .after = c) df %>% relocate(f, .before = b) df %>% relocate(a, .after = last_col()) # Can also select variables based on their type df %>% relocate(where(is.character)) df %>% relocate(where(is.numeric), .after = last_col()) # Or with any other select helper df %>% relocate(any_of(c("a", "e", "i", "o", "u"))) # When .before or .after refers to multiple variables they will be # moved to be immediately before/after the selected variables. df2 <- data.frame( a = 1, b = "a", c = 1, d = "a", stringsAsFactors = FALSE ) df2 %>% relocate(where(is.numeric), .after = where(is.character)) df2 %>% relocate(where(is.numeric), .before = where(is.character))
df <- data.frame( a = 1, b = 1, c = 1, d = "a", e = "a", f = "a", stringsAsFactors = FALSE ) df %>% relocate(f) df %>% relocate(a, .after = c) df %>% relocate(f, .before = b) df %>% relocate(a, .after = last_col()) # Can also select variables based on their type df %>% relocate(where(is.character)) df %>% relocate(where(is.numeric), .after = last_col()) # Or with any other select helper df %>% relocate(any_of(c("a", "e", "i", "o", "u"))) # When .before or .after refers to multiple variables they will be # moved to be immediately before/after the selected variables. df2 <- data.frame( a = 1, b = "a", c = 1, d = "a", stringsAsFactors = FALSE ) df2 %>% relocate(where(is.numeric), .after = where(is.character)) df2 %>% relocate(where(is.numeric), .before = where(is.character))
rename()
changes the names of individual variables using new_name = old_name
syntax.
rename_with()
renames columns using a function.
rename(.data, ...) rename_with(.data, .fn, .cols = everything(), ...)
rename(.data, ...) rename_with(.data, .fn, .cols = everything(), ...)
.data |
A |
... |
For For |
.fn |
A |
.cols |
Columns to rename; defaults to all columns. |
A data.frame
with the following properties:
Rows are not affected.
Column names are changed; column order is preserved.
data.frame
attributes are preserved.
Groups are updated to reflect new names.
rename(mtcars, MilesPerGallon = mpg) rename(mtcars, Cylinders = cyl, Gears = gear) mtcars %>% rename(MilesPerGallon = mpg) rename_with(mtcars, toupper) rename_with(mtcars, toupper, starts_with("c"))
rename(mtcars, MilesPerGallon = mpg) rename(mtcars, Cylinders = cyl, Gears = gear) mtcars %>% rename(MilesPerGallon = mpg) rename_with(mtcars, toupper) rename_with(mtcars, toupper, starts_with("c"))
Replace missing values in a data.frame
or vector
.
replace_na(data, replace, ...)
replace_na(data, replace, ...)
data |
A |
replace |
If |
... |
Additional arguments passed onto methods; not currently used. |
If data
is a data.frame
, replace_na()
returns a data.frame
. If data
is a vector
, replace_na()
returns a
vector
of class determined by the union of data
and replace
.
na_if()
to replace specified values with NA
.
coalesce()
to replace missing values within subsequent vector(s) of value(s).
df <- data.frame(x = c(1, 2, NA), y = c("a", NA, "b"), stringsAsFactors = FALSE) df %>% replace_na(list(x = 0, y = "unknown")) df %>% mutate(x = replace_na(x, 0)) df$x %>% replace_na(0) df$y %>% replace_na("unknown")
df <- data.frame(x = c(1, 2, NA), y = c("a", NA, "b"), stringsAsFactors = FALSE) df %>% replace_na(list(x = 0, y = "unknown")) df %>% mutate(x = replace_na(x, 0)) df$x %>% replace_na(0) df$y %>% replace_na("unknown")
In some quarters, it is considered best to avoid row names, because they are effectively a character column with
different semantics than every other column.
These functions allow you to detect if a data.frame
has row names (has_rownames()
), remove them
(remove_rownames()
), or convert them back-and-forth between an explicit column (rownames_to_column()
and
column_to_rownames()
). Also included is rowid_to_column()
, which adds a column at the start of the dataframe of
ascending sequential row ids starting at 1. Note that this will remove any existing row names.
rownames_to_column(.data, var = "rowname") rowid_to_column(.data, var = "rowid") column_to_rownames(.data, var = "rowname") remove_rownames(.data) has_rownames(.data)
rownames_to_column(.data, var = "rowname") rowid_to_column(.data, var = "rowid") column_to_rownames(.data, var = "rowname") remove_rownames(.data) has_rownames(.data)
.data |
A |
var |
|
column_to_rownames()
always returns a data.frame
.
has_rownames()
returns a logical(1)
.
All other functions return an object of the same class as the input.
# Detect row names has_rownames(mtcars) has_rownames(iris) # Remove row names remove_rownames(mtcars) %>% has_rownames() # Convert between row names and column mtcars <- rownames_to_column(mtcars, var = "car") column_to_rownames(mtcars, var = "car") %>% head() # Adding rowid as a column rowid_to_column(iris) %>% head()
# Detect row names has_rownames(mtcars) has_rownames(iris) # Remove row names remove_rownames(mtcars) %>% has_rownames() # Convert between row names and column mtcars <- rownames_to_column(mtcars, var = "car") column_to_rownames(mtcars, var = "car") %>% head() # Adding rowid as a column rowid_to_column(iris) %>% head()
Select (and optionally rename) variables in a data.frame
, using a concise mini-language that makes it easy to refer
to variables based on their name (e.g. a:f
selects all columns from a
on the left to f
on the right). You can
also use predicate functions like is.numeric()
to select variables based on their properties.
select(.data, ...)
select(.data, ...)
.data |
A |
... |
< |
poorman selections implement a dialect of R where operators make it easy to select variables:
:
for selecting a range of consecutive variables.
!
for taking the complement of a set of variables.
&
and |
for selecting the intersection or the union of two sets of variables.
c()
for combining selections.
In addition, you can use selection helpers. Some helpers select specific columns:
everything()
: Matches all variables.
last_col()
: Select last variable, possibly with an offset.
These helpers select variables by matching patterns in their names:
starts_with()
: Starts with a prefix.
ends_with()
: Ends with a suffix.
contains()
: Contains a literal string.
matches()
: Matches a regular expression.
num_range()
: Matches a numerical range like x01
, x02
, x03
.
These helpers select variables from a character vector:
all_of()
: Matches variable names in a character vector. All names must be present, otherwise an out-of-bounds
error is thrown.
any_of()
: Same as all_of()
, except that no error is thrown for names that don't exist.
This helper selects variables with a function:
where()
: Applies a function to all variables and selects those for which the function returns TRUE
.
An object of the same type as .data
. The output has the following properties:
Rows are not affected.
Output columns are a subset of input columns, potentially with a different order. Columns will be renamed if
new_name = old_name
form is used.
Data frame attributes are preserved.
Groups are maintained; you can't select off grouping variables.
# Here we show the usage for the basic selection operators. See the # specific help pages to learn about helpers like [starts_with()]. # Select variables by name: mtcars %>% select(mpg) # Select multiple variables by separating them with commas. Note # how the order of columns is determined by the order of inputs: mtcars %>% select(disp, gear, am) # Rename variables: mtcars %>% select(MilesPerGallon = mpg, everything()) # The `:` operator selects a range of consecutive variables: select(mtcars, mpg:cyl) # The `!` operator negates a selection: mtcars %>% select(!(mpg:qsec)) mtcars %>% select(!ends_with("p")) # `&` and `|` take the intersection or the union of two selections: iris %>% select(starts_with("Petal") & ends_with("Width")) iris %>% select(starts_with("Petal") | ends_with("Width")) # To take the difference between two selections, combine the `&` and # `!` operators: iris %>% select(starts_with("Petal") & !ends_with("Width"))
# Here we show the usage for the basic selection operators. See the # specific help pages to learn about helpers like [starts_with()]. # Select variables by name: mtcars %>% select(mpg) # Select multiple variables by separating them with commas. Note # how the order of columns is determined by the order of inputs: mtcars %>% select(disp, gear, am) # Rename variables: mtcars %>% select(MilesPerGallon = mpg, everything()) # The `:` operator selects a range of consecutive variables: select(mtcars, mpg:cyl) # The `!` operator negates a selection: mtcars %>% select(!(mpg:qsec)) mtcars %>% select(!ends_with("p")) # `&` and `|` take the intersection or the union of two selections: iris %>% select(starts_with("Petal") & ends_with("Width")) iris %>% select(starts_with("Petal") | ends_with("Width")) # To take the difference between two selections, combine the `&` and # `!` operators: iris %>% select(starts_with("Petal") & !ends_with("Width"))
These functions allow you to select variables based on their names.
starts_with()
: Starts with a prefix.
ends_with()
: Ends with a suffix.
contains()
: Contains a literal string.
matches()
: Matches a regular expression.
all_of()
: Matches variable names in a character vector. All names must be present, otherwise an error is thrown.
any_of()
: The same as all_of()
except it doesn't throw an error.
everything()
: Matches all variables.
last_col()
: Select the last variable, possibly with an offset.
starts_with(match, ignore.case = TRUE, vars = peek_vars()) ends_with(match, ignore.case = TRUE, vars = peek_vars()) contains(match, ignore.case = TRUE, vars = peek_vars()) matches(match, ignore.case = TRUE, perl = FALSE, vars = peek_vars()) num_range(prefix, range, width = NULL, vars = peek_vars()) all_of(x, vars = peek_vars()) any_of(x, vars = peek_vars()) everything(vars = peek_vars()) last_col(offset = 0L, vars = peek_vars())
starts_with(match, ignore.case = TRUE, vars = peek_vars()) ends_with(match, ignore.case = TRUE, vars = peek_vars()) contains(match, ignore.case = TRUE, vars = peek_vars()) matches(match, ignore.case = TRUE, perl = FALSE, vars = peek_vars()) num_range(prefix, range, width = NULL, vars = peek_vars()) all_of(x, vars = peek_vars()) any_of(x, vars = peek_vars()) everything(vars = peek_vars()) last_col(offset = 0L, vars = peek_vars())
match |
|
ignore.case |
|
vars |
|
perl |
|
prefix |
A prefix which starts the numeric range. |
range |
|
width |
|
x |
|
offset |
|
An integer vector giving the position of the matched variables.
select()
, relocate()
, where()
, group_cols()
mtcars %>% select(starts_with("c")) mtcars %>% select(starts_with(c("c", "h"))) mtcars %>% select(ends_with("b")) mtcars %>% relocate(contains("a"), .before = mpg) iris %>% select(matches(".t.")) mtcars %>% select(last_col()) # `all_of()` selects the variables in a character vector: iris %>% select(all_of(c("Petal.Length", "Petal.Width"))) # `all_of()` is strict and will throw an error if the column name isn't found try({iris %>% select(all_of(c("Species", "Genres")))}) # However `any_of()` allows missing variables iris %>% select(any_of(c("Species", "Genres")))
mtcars %>% select(starts_with("c")) mtcars %>% select(starts_with(c("c", "h"))) mtcars %>% select(ends_with("b")) mtcars %>% relocate(contains("a"), .before = mpg) iris %>% select(matches(".t.")) mtcars %>% select(last_col()) # `all_of()` selects the variables in a character vector: iris %>% select(all_of(c("Petal.Length", "Petal.Width"))) # `all_of()` is strict and will throw an error if the column name isn't found try({iris %>% select(all_of(c("Species", "Genres")))}) # However `any_of()` allows missing variables iris %>% select(any_of(c("Species", "Genres")))
Subset rows by their original position in the data.frame
. Grouped data.frame
s use the position within each group.
slice(.data, ...) slice_head(.data, ..., n, prop) slice_tail(.data, ..., n, prop) slice_min(.data, order_by, ..., n, prop, with_ties = TRUE) slice_max(.data, order_by, ..., n, prop, with_ties = TRUE) slice_sample(.data, ..., n, prop, weight_by = NULL, replace = FALSE)
slice(.data, ...) slice_head(.data, ..., n, prop) slice_tail(.data, ..., n, prop) slice_min(.data, order_by, ..., n, prop, with_ties = TRUE) slice_max(.data, order_by, ..., n, prop, with_ties = TRUE) slice_sample(.data, ..., n, prop, weight_by = NULL, replace = FALSE)
.data |
A |
... |
For slice(): integer row values. Provide either positive values to keep, or negative values to drop. The values provided must be either all positive or all negative. Indices beyond the number of rows in the input are silently ignored. |
n , prop
|
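Provide either n, the number of rows, or prop, the proportion of rows to select. If neither is supplied, n = 1 will be used. |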
order_by |
The variable to order by. |
with_ties |
|
weight_by |
Sampling weights. This must evaluate to a vector of non-negative numbers the same length as the input. Weights are automatically standardised to sum to 1. |
replace |
|
An object of the same type as .data
. The output has the following properties:
Each row may appear 0, 1, or many times in the output.
Columns are not modified.
Groups are not modified.
Data frame attributes are preserved.
slice(mtcars, c(1, 2, 3)) mtcars %>% slice(1:3) # Similar to head(mtcars, 1) mtcars %>% slice(1L) # Similar to tail(mtcars, 1): mtcars %>% slice(n()) mtcars %>% slice(5:n()) # Rows can be dropped with negative indices: slice(mtcars, -(1:4)) # First and last rows based on existing order mtcars %>% slice_head(n = 5) mtcars %>% slice_tail(n = 5) # Grouped operations: mtcars %>% group_by(am, cyl, gear) %>% slice_head(n = 2)
slice(mtcars, c(1, 2, 3)) mtcars %>% slice(1:3) # Similar to head(mtcars, 1) mtcars %>% slice(1L) # Similar to tail(mtcars, 1): mtcars %>% slice(n()) mtcars %>% slice(5:n()) # Rows can be dropped with negative indices: slice(mtcars, -(1:4)) # First and last rows based on existing order mtcars %>% slice_head(n = 5) mtcars %>% slice_tail(n = 5) # Grouped operations: mtcars %>% group_by(am, cyl, gear) %>% slice_head(n = 2)
Create one or more scalar variables summarising the variables of an existing data.frame
. Grouped data.frame
s will
result in one row in the output for each group.
summarise(.data, ..., .groups = NULL) summarize(.data, ..., .groups = NULL)
summarise(.data, ..., .groups = NULL) summarize(.data, ..., .groups = NULL)
.data |
A |
... |
Name-value pairs of summary functions. The name will be the name of the variable in the result. |
.groups |
When
In addition, a message informs you of that choice, unless the result is ungrouped, the option
The value can be:
|
summarise()
and summarize()
are synonyms.
# A summary applied to ungrouped tbl returns a single row mtcars %>% summarise(mean = mean(disp), n = n()) # Usually, you'll want to group first mtcars %>% group_by(cyl) %>% summarise(mean = mean(disp), n = n()) # You can summarise to more than one value: mtcars %>% group_by(cyl) %>% summarise(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) # You use a data frame to create multiple columns so you can wrap # this up into a function: my_quantile <- function(x, probs) { data.frame(x = quantile(x, probs), probs = probs) } mtcars %>% group_by(cyl) %>% summarise(my_quantile(disp, c(0.25, 0.75))) # Each summary call removes one grouping level (since that group # is now just a single row) mtcars %>% group_by(cyl, vs) %>% summarise(cyl_n = n()) %>% group_vars()
# A summary applied to ungrouped tbl returns a single row mtcars %>% summarise(mean = mean(disp), n = n()) # Usually, you'll want to group first mtcars %>% group_by(cyl) %>% summarise(mean = mean(disp), n = n()) # You can summarise to more than one value: mtcars %>% group_by(cyl) %>% summarise(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) # You use a data frame to create multiple columns so you can wrap # this up into a function: my_quantile <- function(x, probs) { data.frame(x = quantile(x, probs), probs = probs) } mtcars %>% group_by(cyl) %>% summarise(my_quantile(disp, c(0.25, 0.75))) # Each summary call removes one grouping level (since that group # is now just a single row) mtcars %>% group_by(cyl, vs) %>% summarise(cyl_n = n()) %>% group_vars()
Union all elements of R objects together.
union_all(x, y, ...)
union_all(x, y, ...)
x , y
|
objects to union all elements of (ignoring order) |
... |
other arguments passed on to methods |
first <- mtcars[1:20, ] second <- mtcars[10:32, ] union_all(first, second) # union_all does not remove duplicates a <- data.frame(column = c(1:10, 10)) b <- data.frame(column = c(1:5, 5)) union_all(a, b)
first <- mtcars[1:20, ] second <- mtcars[10:32, ] union_all(first, second) # union_all does not remove duplicates a <- data.frame(column = c(1:10, 10)) b <- data.frame(column = c(1:5, 5)) union_all(a, b)
Convenience function to paste together multiple columns.
unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)
unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)
data |
A |
col |
|
... |
The columns to unite. |
sep |
|
remove |
|
na.rm |
|
A data.frame
with the columns passed via ...
pasted together in a new column.
df <- data.frame(x = c("a", "a", NA, NA), y = c("b", NA, "b", NA)) df df %>% unite("z", x:y, remove = FALSE) # To remove missing values: df %>% unite("z", x:y, na.rm = TRUE, remove = FALSE)
df <- data.frame(x = c("a", "a", NA, NA), y = c("b", NA, "b", NA)) df df %>% unite("z", x:y, remove = FALSE) # To remove missing values: df %>% unite("z", x:y, na.rm = TRUE, remove = FALSE)
This selection helper selects the variables for which a function returns TRUE
.
where(fn)
where(fn)
fn |
A function that returns TRUE or FALSE. |
A vector of integer
column positions which are the result of the fn
evaluation.
iris %>% select(where(is.numeric)) iris %>% select(where(function(x) is.numeric(x))) iris %>% select(where(function(x) is.numeric(x) && mean(x) > 3.5))
iris %>% select(where(is.numeric)) iris %>% select(where(function(x) is.numeric(x))) iris %>% select(where(function(x) is.numeric(x) && mean(x) > 3.5))
Six variations on ranking functions, mimicking the ranking functions described in SQL2003. They are currently
implemented using the built in rank()
function. All ranking functions map smallest inputs to smallest outputs. Use
desc()
to reverse the direction.
cume_dist(x) dense_rank(x) min_rank(x) ntile(x = row_number(), n) percent_rank(x) row_number(x)
cume_dist(x) dense_rank(x) min_rank(x) ntile(x = row_number(), n) percent_rank(x) row_number(x)
x |
A vector of values to rank. Missing values are left as is. If you want to treat them as the smallest or
largest values, replace with Inf or -Inf before ranking. |
n |
|
cume_dist()
: a cumulative distribution function. Proportion of all values less than or equal to the current rank.
dense_rank()
: like min_rank()
, but with no gaps between ranks
min_rank()
: equivalent to rank(ties.method = "min")
ntile()
: a rough rank, which breaks the input vector into n
buckets. The size of the buckets may differ by up
to one; larger buckets have lower rank.
percent_rank()
: a number between 0
and 1
computed by rescaling min_rank
to [0, 1]
row_number()
: equivalent to rank(ties.method = "first")
x <- c(5, 1, 3, 2, 2, NA) row_number(x) min_rank(x) dense_rank(x) percent_rank(x) cume_dist(x) ntile(x, 2) ntile(1:8, 3) # row_number can be used with single table verbs without specifying x # (for data frames and databases that support windowing) mutate(mtcars, row_number() == 1L) mtcars %>% filter(between(row_number(), 1, 10))
x <- c(5, 1, 3, 2, 2, NA) row_number(x) min_rank(x) dense_rank(x) percent_rank(x) cume_dist(x) ntile(x, 2) ntile(1:8, 3) # row_number can be used with single table verbs without specifying x # (for data frames and databases that support windowing) mutate(mtcars, row_number() == 1L) mtcars %>% filter(between(row_number(), 1, 10))
This function allows you to modify the grouping variables for a single operation.
with_groups(.data, .groups, .f, ...)
with_groups(.data, .groups, .f, ...)
.data |
A |
.groups |
< Use |
.f |
A |
... |
Additional arguments passed on to .f. |
df <- data.frame(g = c(1, 1, 2, 2, 3), x = runif(5)) df %>% with_groups(g, mutate, x_mean = mean(x)) df %>% with_groups(g, ~ mutate(.x, x_mean = mean(x))) df %>% group_by(g) %>% with_groups(NULL, mutate, x_mean = mean(x)) # NB: grouping can't be restored if you remove the grouping variables df %>% group_by(g) %>% with_groups(NULL, mutate, g = NULL)
df <- data.frame(g = c(1, 1, 2, 2, 3), x = runif(5)) df %>% with_groups(g, mutate, x_mean = mean(x)) df %>% with_groups(g, ~ mutate(.x, x_mean = mean(x))) df %>% group_by(g) %>% with_groups(NULL, mutate, x_mean = mean(x)) # NB: grouping can't be restored if you remove the grouping variables df %>% group_by(g) %>% with_groups(NULL, mutate, g = NULL)