# Packages: -------------------------------------------------------------------
library(tidyverse)
library(dbplyr)
library(Lahman)
library(data.table)
#lapply( c('tidyverse', 'RSQLite', 'dbplyr', 'Lahman', 'data.table'),
#           install.packages)
# Create a local SQLlite database of the Lahman data: ----------------------
#! If this fails, run: install.packages('RSQLite')
lahman = lahman_sqlite()
# Copy the batting table to memory as a tibble: -------------------------------
batting_tbl = lahman %>% 
  tbl("BATTING") %>% 
  collect()
class(batting_tbl)[1] "tbl_df"     "tbl"        "data.frame"
# Convert the copy in memory to a data.table: ---------------------------------
batting_dt = as.data.table(batting_tbl)
class(batting_dt)[1] "data.table" "data.frame"
The following points from the datatable-faq: 1.5, 1.6, 2.1, 2.3, 2.16, 2.17, 2.21, 2.23
j         playerID yearID league stint
     1: abercda01   1871     NA     1
     2:  addybo01   1871     NA     1
     3: allisar01   1871     NA     1
     4: allisdo01   1871     NA     1
     5: ansonca01   1871     NA     1
    ---                              
102812: zimmejo02   2016     AL     1
102813: zimmery01   2016     NL     1
102814: zobribe01   2016     NL     1
102815: zuninmi01   2016     AL     1
102816:  zychto01   2016     AL     1
The j clause in a data.table is used to select columns. The .() is an alias for list() within a data.table. Each element of the list becomes a column in the resulting data.table. We can provide new names for the columns by naming the list elements.
#SQL: SELECT
query = 
'
SELECT playerID, yearID, lgID as league, stint
FROM BATTING
'
lahman %>% tbl(sql(query)) # Source:   SQL [?? x 4]
# Database: sqlite 3.22.0
#   [/var/folders/t6/m1sbk4dn10181ns0jx_ss0s00000gn/T//RtmpcQyRLQ/lahman.sqlite]
   playerID  yearID league stint
   <chr>      <int> <chr>  <int>
 1 abercda01   1871 NA         1
 2 addybo01    1871 NA         1
 3 allisar01   1871 NA         1
 4 allisdo01   1871 NA         1
 5 ansonca01   1871 NA         1
 6 armstbo01   1871 NA         1
 7 barkeal01   1871 NA         1
 8 barnero01   1871 NA         1
 9 barrebi01   1871 NA         1
10 barrofr01   1871 NA         1
# … with more rows
# dplyr: select, rename, transmute
batting_tbl %>%
    transmute(playerID, yearID, league = lgID, stint)# A tibble: 102,816 x 4
   playerID  yearID league stint
   <chr>      <int> <chr>  <int>
 1 abercda01   1871 NA         1
 2 addybo01    1871 NA         1
 3 allisar01   1871 NA         1
 4 allisdo01   1871 NA         1
 5 ansonca01   1871 NA         1
 6 armstbo01   1871 NA         1
 7 barkeal01   1871 NA         1
 8 barnero01   1871 NA         1
 9 barrebi01   1871 NA         1
10 barrofr01   1871 NA         1
# … with 102,806 more rows
# A tibble: 102,816 x 4
   playerID  yearID league stint
   <chr>      <int> <chr>  <int>
 1 abercda01   1871 NA         1
 2 addybo01    1871 NA         1
 3 allisar01   1871 NA         1
 4 allisdo01   1871 NA         1
 5 ansonca01   1871 NA         1
 6 armstbo01   1871 NA         1
 7 barkeal01   1871 NA         1
 8 barnero01   1871 NA         1
 9 barrebi01   1871 NA         1
10 barrofr01   1871 NA         1
# … with 102,806 more rows
j         playerID       avg
     1: abercda01 0.0000000
     2:  addybo01 0.2711864
     3: allisar01 0.2919708
     4: allisdo01 0.3308271
     5: ansonca01 0.3250000
    ---                    
102812: zimmejo02 0.2500000
102813: zimmery01 0.2177986
102814: zobribe01 0.2715105
102815: zuninmi01 0.2073171
102816:  zychto01       NaN
Any valid R expression can be used in “j” and we may reference the columns as variables. This is similar to the behavior of the R function with(). As the long as the expression returns a list the result is a data.table.
#SQL: 
query = 
'
SELECT playerID, 
       (Cast (H as Float) /  Cast (AB as Float) ) as avg
FROM BATTING
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 102,816 x 2
   playerID    avg
   <chr>     <dbl>
 1 abercda01 0    
 2 addybo01  0.271
 3 allisar01 0.292
 4 allisdo01 0.331
 5 ansonca01 0.325
 6 armstbo01 0.224
 7 barkeal01 0.25 
 8 barnero01 0.401
 9 barrebi01 0.2  
10 barrofr01 0.151
# … with 102,806 more rows
# A tibble: 102,816 x 2
   playerID    avg
   <chr>     <dbl>
 1 abercda01 0    
 2 addybo01  0.271
 3 allisar01 0.292
 4 allisdo01 0.331
 5 ansonca01 0.325
 6 armstbo01 0.224
 7 barkeal01 0.25 
 8 barnero01 0.401
 9 barrebi01 0.2  
10 barrofr01 0.151
# … with 102,806 more rows
j   max_HBP
1:      51
Here we find the maximum HBP entry in the table using a function that returns a single value.
# SQL: SELECT with aggregation function
query =
'
SELECT max(HBP) as max_hbp
FROM BATTING
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 1 x 1
  max_hbp
    <int>
1      51
# A tibble: 1 x 1
  max_HBP
    <dbl>
1      51
byby# data.table
#batting_dt[ , .(avg = sum(H) / sum(AB)), by = .(playerID, yearID, lgID)]
batting_dt[ , .(avg = sum(H) / sum(AB)), .(playerID, yearID, lgID)]        playerID yearID lgID       avg
    1: abercda01   1871   NA 0.0000000
    2:  addybo01   1871   NA 0.2711864
    3: allisar01   1871   NA 0.2919708
    4: allisdo01   1871   NA 0.3308271
    5: ansonca01   1871   NA 0.3250000
   ---                                
97776: zimmejo02   2016   AL 0.2500000
97777: zimmery01   2016   NL 0.2177986
97778: zobribe01   2016   NL 0.2715105
97779: zuninmi01   2016   AL 0.2073171
97780:  zychto01   2016   AL       NaN
We can perform grouped operations in j using the by argument. Use a list to form groups using the unique combinations from multiple columns. Observe that unlike the SQL and dplyr versions the resulting table is not re-ordered by group.
keyby# data.table: keyby
# Here the parameter name is required
batting_dt[ , .(avg = sum(H) / sum(AB)), keyby = .(playerID, yearID, lgID)]        playerID yearID lgID       avg
    1: aardsda01   2004   NL       NaN
    2: aardsda01   2006   NL 0.0000000
    3: aardsda01   2007   AL       NaN
    4: aardsda01   2008   AL 0.0000000
    5: aardsda01   2009   AL       NaN
   ---                                
97776: zwilldu01   1914   FL 0.3125000
97777: zwilldu01   1915   FL 0.2864964
97778: zwilldu01   1916   NL 0.1132075
97779:  zychto01   2015   AL       NaN
97780:  zychto01   2016   AL       NaN
If we use keyby the new data.table is reordered by group.
#SQL: 
query = 
'
SELECT playerID, yearID, lgID,
       ( Cast (sum(H) as Float) / Cast (sum(AB) as Float) ) as avg
FROM BATTING
GROUP BY playerID, yearID, lgID
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 97,780 x 4
   playerID  yearID lgID     avg
   <chr>      <int> <chr>  <dbl>
 1 aardsda01   2004 NL    NA    
 2 aardsda01   2006 NL     0    
 3 aardsda01   2007 AL    NA    
 4 aardsda01   2008 AL     0    
 5 aardsda01   2009 AL    NA    
 6 aardsda01   2010 AL    NA    
 7 aardsda01   2012 AL    NA    
 8 aardsda01   2013 NL    NA    
 9 aardsda01   2015 NL     0    
10 aaronha01   1954 NL     0.280
# … with 97,770 more rows
# dplyr: group_by
batting_tbl %>% 
  group_by(playerID, yearID, lgID) %>%
  summarize( avg = sum(H) / sum(AB) )# A tibble: 97,780 x 4
# Groups:   playerID, yearID [95,250]
   playerID  yearID lgID      avg
   <chr>      <int> <chr>   <dbl>
 1 aardsda01   2004 NL    NaN    
 2 aardsda01   2006 NL      0    
 3 aardsda01   2007 AL    NaN    
 4 aardsda01   2008 AL      0    
 5 aardsda01   2009 AL    NaN    
 6 aardsda01   2010 AL    NaN    
 7 aardsda01   2012 AL    NaN    
 8 aardsda01   2013 NL    NaN    
 9 aardsda01   2015 NL      0    
10 aaronha01   1954 NL      0.280
# … with 97,770 more rows
i       playerID HBP
   1:  abadfe01   0
   2:  abadfe01   0
   3: abreujo02  15
   4: achteaj01   0
   5: ackledu01   0
  ---              
1479: zimmejo02   0
1480: zimmery01   5
1481: zobribe01   4
1482: zuninmi01   6
1483:  zychto01   0
The i clause in a data.table is used to select rows. Similar to dplyr::filter or subsetting a data.frame we often use it with a logical index whose length matches the number of rows.
       playerID teamID HBP
   1: barreji01    DET   5
   2: buelofr01    DET   3
   3: caseydo01    DET  10
   4: crockda01    DET   2
   5: cronija01    DET   1
  ---                     
4212: verlaju01    DET   0
4213: wilsoal01    DET   0
4214: wilsobo02    DET   0
4215: wilsoju10    DET   0
4216: zimmejo02    DET   0
[1] "teamID"
       playerID yearID HBP
   1: ahmedni01   2016   4
   2: barreja01   2016   0
   3: bournmi01   2016   0
   4: brachsi01   2016   0
   5: bradlar01   2016   0
  ---                     
1479: taylomi02   2016   1
1480: treinbl01   2016   0
1481: turnetr01   2016   1
1482: werthja01   2016   4
1483: zimmery01   2016   5
    playerID yearID HBP
1: trottsa01   1884   2
If our data.table has one or more columns designated as a key, we can subset by value in i. Note the need to pass integer-valued keys within a list to differentiate from simple subsetting by integer.
When we set a key using setkey() the rows are reordered to enable faster subsetting by key. This is similar to “indexing” in SQL. Read more about keys in this vignette.
#SQL: WHERE
query = 
'
SELECT playerID, HBP
FROM BATTING
WHERE yearID = 2016
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 1,483 x 2
   playerID    HBP
   <chr>     <int>
 1 abadfe01      0
 2 abadfe01      0
 3 abreujo02    15
 4 achteaj01     0
 5 ackledu01     0
 6 adamecr01     4
 7 adamsau01     0
 8 adamsma01     2
 9 adlemti01     0
10 adriaeh01     2
# … with 1,473 more rows
# A tibble: 1,483 x 2
   playerID    HBP
   <chr>     <int>
 1 abadfe01      0
 2 abadfe01      0
 3 abreujo02    15
 4 achteaj01     0
 5 ackledu01     0
 6 adamecr01     4
 7 adamsau01     0
 8 adamsma01     2
 9 adlemti01     0
10 adriaeh01     2
# … with 1,473 more rows
# A tibble: 1,483 x 3
   playerID  yearID   HBP
   <chr>      <int> <int>
 1 abadfe01    2016     0
 2 abadfe01    2016     0
 3 abreujo02   2016    15
 4 achteaj01   2016     0
 5 ackledu01   2016     0
 6 adamecr01   2016     4
 7 adamsau01   2016     0
 8 adamsma01   2016     2
 9 adlemti01   2016     0
10 adriaeh01   2016     2
# … with 1,473 more rows
    playerID  HR
1:  dunnad01 462
2: beltrad01 403
3: ortizda01 521
4: soriaal01 409
5: pujolal01 591
6: rodrial01 507
7: cabremi01 446
8: teixema01 409
Those familiar with dplyr will be familiar with the “pipe” operation %>% that calls the function on the right with the object on the left (usually data) as its first parameter. We can take advantage of the . used by margittr::%>% to use pipes with data.table’s [] function.
    playerID  HR
1:  dunnad01 462
2: beltrad01 403
3: ortizda01 521
4: soriaal01 409
5: pujolal01 591
6: rodrial01 507
7: cabremi01 446
8: teixema01 409
You can also simply “chain” the [] statements together.
# dplyr: Use %>% to chain
batting_tbl %>% 
  filter(yearID > 2000) %>%
  group_by(playerID) %>%
  summarize( HR = sum(HR) ) %>%
  ## Here's the pipe from above
  filter( HR > 400)# A tibble: 8 x 2
  playerID     HR
  <chr>     <int>
1 beltrad01   403
2 cabremi01   446
3 dunnad01    462
4 ortizda01   521
5 pujolal01   591
6 rodrial01   507
7 soriaal01   409
8 teixema01   409
i    playerID  HR
1: pujolal01 591
2: ortizda01 521
3: rodrial01 507
4:  dunnad01 462
5: cabremi01 446
6: soriaal01 409
7: teixema01 409
8: beltrad01 403
Use order() in the i clause to sort a data.table.
help(data.table::order):“Also x[order(.)] is now optimised internally to use data.table’s fast order. data.table always reorders in”C-locale" (see Details). To sort by session locale, use x[base::order(.)]."
#SQL: 
query = 
'
SELECT *
FROM (
 SELECT playerID, sum(HR) as HR
 FROM BATTING
 WHERE yearID > 2000
 GROUP BY playerID
) 
WHERE HR > 400
ORDER BY -HR
'
lahman %>% tbl(sql(query))  %>% collect()# A tibble: 8 x 2
  playerID     HR
  <chr>     <int>
1 pujolal01   591
2 ortizda01   521
3 rodrial01   507
4 dunnad01    462
5 cabremi01   446
6 soriaal01   409
7 teixema01   409
8 beltrad01   403
# dplyr: arrange
batting_tbl %>% 
  filter(yearID > 2000) %>%
  group_by(playerID) %>%
  summarize( HR = sum(HR) ) %>%
  ## Here's the pipe from above
  filter( HR > 400) %>%
  arrange( desc(HR) )# A tibble: 8 x 2
  playerID     HR
  <chr>     <int>
1 pujolal01   591
2 ortizda01   521
3 rodrial01   507
4 dunnad01    462
5 cabremi01   446
6 soriaal01   409
7 teixema01   409
8 beltrad01   403
.N     playerID N
 1: johnske05 2
 2:  kempma01 2
 3:  hillaa01 2
 4: coghlch01 2
 5: bruceja01 2
 6: gomezca01 2
 7: reddijo01 2
 8: lucrojo01 2
 9: nunezed02 2
10: beltrca01 2
11: uptonbj01 2
In SQL we often use COUNT to determine how many records in a group meet some condition. In dplyr we can use n() within a function to count the number of rows by group. In data.table the special symbol .N plays the same role.
Here we use .N to find how many players have more than 100 AB with two or more teams in 2016.
# data.table: 20-20
batting_dt[yearID > 2000 , .(SB = sum(SB), HR = sum(HR)), .(playerID, yearID)] %>%
  .[SB > 19 & HR > 19] %>% .[ , .N, yearID]    yearID  N
 1:   2001 15
 2:   2002  9
 3:   2003  6
 4:   2004  9
 5:   2005  5
 6:   2006  7
 7:   2007 14
 8:   2008  9
 9:   2009 14
10:   2010  7
11:   2011 12
12:   2012 10
13:   2013  9
14:   2014  5
15:   2015  4
16:   2016  9
Here we use .N to count the number of “20-20” players in each year since 2000.
#SQL: stints example
query = 
'
SELECT playerID, COUNT(teamID) as N
FROM BATTING
WHERE yearID = 2016 AND AB > 99
GROUP BY playerID
HAVING N > 1
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 11 x 2
   playerID      N
   <chr>     <int>
 1 beltrca01     2
 2 bruceja01     2
 3 coghlch01     2
 4 gomezca01     2
 5 hillaa01      2
 6 johnske05     2
 7 kempma01      2
 8 lucrojo01     2
 9 nunezed02     2
10 reddijo01     2
11 uptonbj01     2
#SQL: 20-20
query = 
'
SELECT yearID, COUNT(playerID) as N
FROM (
 SELECT playerID, yearID, sum(SB) as SB, sum(HR) as HR
 FROM BATTING
 WHERE yearID > 2000
 GROUP BY playerID, yearID
 HAVING SB > 19 AND HR > 19
)
GROUP BY yearID
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 16 x 2
   yearID     N
    <int> <int>
 1   2001    15
 2   2002     8
 3   2003     4
 4   2004     9
 5   2005     5
 6   2006     7
 7   2007    14
 8   2008     9
 9   2009    14
10   2010     7
11   2011    12
12   2012     9
13   2013     9
14   2014     5
15   2015     4
16   2016     8
# dplyr: n()
batting_tbl %>% 
  filter(AB > 99 & yearID == 2016) %>%
  group_by(playerID, yearID) %>%
  summarize(N=n()) %>%
  filter(N > 1)# A tibble: 11 x 3
# Groups:   playerID [11]
   playerID  yearID     N
   <chr>      <int> <int>
 1 beltrca01   2016     2
 2 bruceja01   2016     2
 3 coghlch01   2016     2
 4 gomezca01   2016     2
 5 hillaa01    2016     2
 6 johnske05   2016     2
 7 kempma01    2016     2
 8 lucrojo01   2016     2
 9 nunezed02   2016     2
10 reddijo01   2016     2
11 uptonbj01   2016     2
# dplyr: n()
batting_tbl %>%
  filter( yearID > 2000) %>%
  group_by(yearID, playerID) %>%
  summarize( SB = sum(SB), HR = sum(HR) ) %>%
  filter( SB > 19 & HR > 19) %>%
  summarize( n = n() )# A tibble: 16 x 2
   yearID     n
    <int> <int>
 1   2001    15
 2   2002     9
 3   2003     6
 4   2004     9
 5   2005     5
 6   2006     7
 7   2007    14
 8   2008     9
 9   2009    14
10   2010     7
11   2011    12
12   2012    10
13   2013     9
14   2014     5
15   2015     4
16   2016     9
Write data.table code to match each of the SQL or dplyr queries to the right.
Find the 2016 average for all playerID with team “DET” and at least 100 AB.
Find the playerID from team “DET” with the highest average in 2016, with a minimum of 100 AB.
Find the playerID who hit the most total HRs between 2001 and 2010.
Find all playerIDs with at least 200 hits in 2016 (across all stints) sorted in descending order.
Find the number of playerIDs with at least 200 hits in each year since 2000.
     playerID       avg
 1: avilemi01 0.2095808
 2: cabremi01 0.3159664
 3: casteni01 0.2846715
 4: collity01 0.2352941
 5: iglesjo01 0.2548180
 6: kinslia01 0.2880259
 7: martijd02 0.3065217
 8: martivi01 0.2893309
 9: maybica01 0.3151862
10: mccanja02 0.2209302
11: rominan01 0.2356322
12: saltaja01 0.1707317
13: uptonju01 0.2456140
# data.table
batting_dt[ teamID == 'DET' & yearID == 2016 & AB >= 100, 
            .(playerID, avg = H / AB)][avg==max(avg)]    playerID       avg
1: cabremi01 0.3159664
    playerID  HR
1: rodrial01 424
    playerID   H
1: altuvjo01 216
2: bettsmo01 214
3: segurje01 203
4: pedrodu01 201
# data.table
batting_dt[yearID > 1999, .(H = sum(H)), .(playerID, yearID) ][ H>=200 ] %>%
   .[ , .N, yearID] %>% .[order(-yearID)]    yearID N
 1:   2016 4
 2:   2015 2
 3:   2014 2
 4:   2012 2
 5:   2011 5
 6:   2010 2
 7:   2009 4
 8:   2008 3
 9:   2007 8
10:   2006 8
11:   2005 3
12:   2004 8
13:   2003 7
14:   2002 5
15:   2001 6
16:   2000 6
query = 
'
SELECT playerID, ( Cast (H as Float) / Cast (AB as Float) ) as avg
FROM BATTING 
WHERE teamID = "DET" AND yearID = 2016 AND AB > 99
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 13 x 2
   playerID    avg
   <chr>     <dbl>
 1 avilemi01 0.210
 2 cabremi01 0.316
 3 casteni01 0.285
 4 collity01 0.235
 5 iglesjo01 0.255
 6 kinslia01 0.288
 7 martijd02 0.307
 8 martivi01 0.289
 9 maybica01 0.315
10 mccanja02 0.221
11 rominan01 0.236
12 saltaja01 0.171
13 uptonju01 0.246
query = 
'
SELECT playerID, max( Cast (H as Float) / Cast (AB as Float) ) as avg
FROM BATTING 
WHERE teamID = "DET" AND yearID = 2016 AND AB > 99
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 1 x 2
  playerID    avg
  <chr>     <dbl>
1 cabremi01 0.316
query = 
'
SELECT playerID, max(HR)
FROM (
 SELECT playerID, sum(HR) as HR
 FROM BATTING 
 WHERE yearID < 2011 AND yearID > 2000
 GROUP BY playerID
) 
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 1 x 2
  playerID  `max(HR)`
  <chr>         <int>
1 rodrial01       424
query = 
'
SELECT playerID, sum(H) as H
FROM BATTING 
WHERE yearID = 2016
GROUP BY playerID
HAVING H > 199
ORDER BY -H
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 4 x 2
  playerID      H
  <chr>     <int>
1 altuvjo01   216
2 bettsmo01   214
3 segurje01   203
4 pedrodu01   201
query = 
'
SELECT yearID as year, COUNT(playerID) as N
FROM (
 SELECT playerID, yearID, sum(H) as H
 FROM BATTING 
 WHERE yearID > 1999
 GROUP BY playerID, yearID
 HAVING H > 199
) 
GROUP BY yearID
ORDER BY -yearID
'
lahman %>% tbl(sql(query)) %>% collect()# A tibble: 16 x 2
    year     N
   <int> <int>
 1  2016     4
 2  2015     2
 3  2014     2
 4  2012     2
 5  2011     5
 6  2010     2
 7  2009     4
 8  2008     3
 9  2007     8
10  2006     8
11  2005     3
12  2004     8
13  2003     7
14  2002     5
15  2001     6
16  2000     6
batting_tbl %>%
  filter(yearID == 2016 & teamID == 'DET' & AB > 100) %>%
  transmute(playerID, avg = H / AB)# A tibble: 13 x 2
   playerID    avg
   <chr>     <dbl>
 1 avilemi01 0.210
 2 cabremi01 0.316
 3 casteni01 0.285
 4 collity01 0.235
 5 iglesjo01 0.255
 6 kinslia01 0.288
 7 martijd02 0.307
 8 martivi01 0.289
 9 maybica01 0.315
10 mccanja02 0.221
11 rominan01 0.236
12 saltaja01 0.171
13 uptonju01 0.246
batting_tbl %>%
  filter(yearID == 2016 & teamID == 'DET' & AB > 100) %>%
  transmute(playerID, avg = H / AB) %>%
  filter(avg == max(avg))# A tibble: 1 x 2
  playerID    avg
  <chr>     <dbl>
1 cabremi01 0.316
batting_tbl %>%
  filter(yearID %in% 2001:2010) %>%
  group_by(playerID) %>%
  summarize(HR = sum(HR)) %>%
  filter(HR == max(HR))# A tibble: 1 x 2
  playerID     HR
  <chr>     <int>
1 rodrial01   424
batting_tbl %>%
  filter(yearID %in% 2016) %>%
  group_by(playerID) %>%
  summarize(H = sum(H)) %>%
  filter(H > 199) %>%
  arrange( desc(H) )# A tibble: 4 x 2
  playerID      H
  <chr>     <int>
1 altuvjo01   216
2 bettsmo01   214
3 segurje01   203
4 pedrodu01   201
batting_tbl %>%
  filter(yearID > 1999) %>%
  group_by(yearID, playerID) %>%
  summarize(H = sum(H)) %>% # still grouped by yearIDs
  filter(H > 199) %>%       # or just summarize( n = sum(H > 199))
  summarize( n = n() ) %>%
  arrange( desc(yearID) )# A tibble: 16 x 2
   yearID     n
    <int> <int>
 1   2016     4
 2   2015     2
 3   2014     2
 4   2012     2
 5   2011     5
 6   2010     2
 7   2009     4
 8   2008     3
 9   2007     8
10   2006     8
11   2005     3
12   2004     8
13   2003     7
14   2002     5
15   2001     6
16   2000     6