Group by one or more variables

Most data operations are done on groups defined by variables. group_by() takes an existing tbl and converts it into a grouped tbl where operations are performed "by group". ungroup() removes grouping.

group_by(.data, ..., add = FALSE, .drop = group_by_drop_default(.data))

ungroup(x, ...)

Arguments

.data	a tbl
...	Variables to group by. All tbls accept variable names. Some tbls will accept functions of variables. Duplicated groups will be silently dropped.
add	When `add = FALSE`, the default, `group_by()` will override existing groups. To add to the existing groups, use `add = TRUE`.
.drop	When `.drop = TRUE`, empty groups are dropped. See `group_by_drop_default()` for what the default value is for this argument.
x	A `tbl()`

Value

A grouped data frame if the combination of `...` and `add` yields a non-empty set of grouping columns; a regular (ungrouped) data frame otherwise.

Tbl types

group_by() is an S3 generic with methods for the built-in tbls listed below. See the help for the corresponding classes and their manip methods for more details:

data.frame: grouped_df
data.table: dtplyr::grouped_dt
SQLite: src_sqlite()
PostgreSQL: src_postgres()
MySQL: src_mysql()

Scoped grouping

The three scoped variants (group_by_all(), group_by_if() and group_by_at()) make it easy to group a dataset by a selection of variables.

Examples

by_cyl <- mtcars %>% group_by(cyl)

# grouping doesn't change how the data looks (apart from listing
# how it's grouped):
by_cyl
#> # A tibble: 32 x 11
#> # Groups:   cyl [3]
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>  * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> # … with 22 more rows

# It changes how it acts with the other dplyr verbs:
by_cyl %>% summarise(
  disp = mean(disp),
  hp = mean(hp)
)
#> # A tibble: 3 x 3
#>     cyl  disp    hp
#>   <dbl> <dbl> <dbl>
#> 1     4  105.  82.6
#> 2     6  183. 122. 
#> 3     8  353. 209. 
by_cyl %>% filter(disp == max(disp))
#> # A tibble: 3 x 11
#> # Groups:   cyl [3]
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#> 2  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#> 3  10.4     8  472    205  2.93  5.25  18.0     0     0     3     4

# Each call to summarise() removes a layer of grouping
by_vs_am <- mtcars %>% group_by(vs, am)
by_vs <- by_vs_am %>% summarise(n = n())
by_vs
#> # A tibble: 4 x 3
#> # Groups:   vs [2]
#>      vs    am     n
#>   <dbl> <dbl> <int>
#> 1     0     0    12
#> 2     0     1     6
#> 3     1     0     7
#> 4     1     1     7
by_vs %>% summarise(n = sum(n))
#> # A tibble: 2 x 2
#>      vs     n
#>   <dbl> <int>
#> 1     0    18
#> 2     1    14

# To remove grouping, use ungroup()
by_vs %>%
  ungroup() %>%
  summarise(n = sum(n))
#> # A tibble: 1 x 1
#>       n
#>   <int>
#> 1    32

# You can group by expressions: this is just short-hand for
# a mutate/rename followed by a simple group_by
mtcars %>% group_by(vsam = vs + am)
#> # A tibble: 32 x 12
#> # Groups:   vsam [3]
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb  vsam
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4     1
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4     1
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1     2
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1     1
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2     0
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1     1
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4     0
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2     1
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2     1
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4     1
#> # … with 22 more rows

# By default, group_by overrides existing grouping
by_cyl %>%
  group_by(vs, am) %>%
  group_vars()
#> [1] "vs" "am"

# Use add = TRUE to append to the existing groups instead
by_cyl %>%
  group_by(vs, am, add = TRUE) %>%
  group_vars()
#> [1] "cyl" "vs"  "am" 

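# when factors are involved, groups can be empty: here level "b" has no
# rows, and it is absent from the output below because empty groups
# were dropped (see the .drop argument)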
tbl <- tibble(
  x = 1:10,
  y = factor(rep(c("a", "c"), each  = 5), levels = c("a", "b", "c"))
)
tbl %>%
  group_by(y) %>%
  group_rows()
#> [[1]]
#> [1] 1 2 3 4 5
#> 
#> [[2]]
#> [1]  6  7  8  9 10
#>

Arguments

Value

Tbl types

Scoped grouping

See also

Examples

Contents