Getting Started

Column

Packages and data

# Packages: -------------------------------------------------------------------
library(tidyverse)
library(dbplyr)
library(Lahman)
library(data.table)
#lapply( c('tidyverse', 'RSQLite', 'dbplyr', 'Lahman', 'data.table'),
#           install.packages)

# Create a local SQLlite database of the Lahman data: ----------------------
#! If this fails, run: install.packages('RSQLite')
lahman = lahman_sqlite()

# Copy the batting table to memory as a tibble: -------------------------------
batting_tbl = lahman %>% 
  tbl("BATTING") %>% 
  collect()
class(batting_tbl)

[1] "tbl_df"     "tbl"        "data.frame"

# Convert the copy in memory to a data.table: ---------------------------------
batting_dt = as.data.table(batting_tbl)
class(batting_dt)

[1] "data.table" "data.frame"

Column

Vignettes

Introduction to data.table
Reference Semantics
Efficient reshaping using data.tables
The following points from the datatable-faq: 1.5, 1.6, 2.1, 2.3, 2.16, 2.17, 2.21, 2.23

Select with `j`

Column

data.table

# data.table
batting_dt[ , .(playerID, yearID, league = lgID, stint)]

         playerID yearID league stint
     1: abercda01   1871     NA     1
     2:  addybo01   1871     NA     1
     3: allisar01   1871     NA     1
     4: allisdo01   1871     NA     1
     5: ansonca01   1871     NA     1
    ---                              
102812: zimmejo02   2016     AL     1
102813: zimmery01   2016     NL     1
102814: zobribe01   2016     NL     1
102815: zuninmi01   2016     AL     1
102816:  zychto01   2016     AL     1

The j clause in a data.table is used to select columns. The .() is an alias for list() within a data.table. Each element of the list becomes a column in the resulting data.table. We can provide new names for the columns by naming the list elements.

Column

SQL

#SQL: SELECT
query = 
'
SELECT playerID, yearID, lgID as league, stint
FROM BATTING
'
lahman %>% tbl(sql(query))

# Source:   SQL [?? x 4]
# Database: sqlite 3.22.0
#   [/var/folders/t6/m1sbk4dn10181ns0jx_ss0s00000gn/T//RtmpcQyRLQ/lahman.sqlite]
   playerID  yearID league stint
   <chr>      <int> <chr>  <int>
 1 abercda01   1871 NA         1
 2 addybo01    1871 NA         1
 3 allisar01   1871 NA         1
 4 allisdo01   1871 NA         1
 5 ansonca01   1871 NA         1
 6 armstbo01   1871 NA         1
 7 barkeal01   1871 NA         1
 8 barnero01   1871 NA         1
 9 barrebi01   1871 NA         1
10 barrofr01   1871 NA         1
# … with more rows

dplyr

# dplyr: select, rename, transmute
batting_tbl %>%
    transmute(playerID, yearID, league = lgID, stint)

# A tibble: 102,816 x 4
   playerID  yearID league stint
   <chr>      <int> <chr>  <int>
 1 abercda01   1871 NA         1
 2 addybo01    1871 NA         1
 3 allisar01   1871 NA         1
 4 allisdo01   1871 NA         1
 5 ansonca01   1871 NA         1
 6 armstbo01   1871 NA         1
 7 barkeal01   1871 NA         1
 8 barnero01   1871 NA         1
 9 barrebi01   1871 NA         1
10 barrofr01   1871 NA         1
# … with 102,806 more rows

batting_tbl %>%
    select(playerID, yearID, league = lgID, stint)

# A tibble: 102,816 x 4
   playerID  yearID league stint
   <chr>      <int> <chr>  <int>
 1 abercda01   1871 NA         1
 2 addybo01    1871 NA         1
 3 allisar01   1871 NA         1
 4 allisdo01   1871 NA         1
 5 ansonca01   1871 NA         1
 6 armstbo01   1871 NA         1
 7 barkeal01   1871 NA         1
 8 barnero01   1871 NA         1
 9 barrebi01   1871 NA         1
10 barrofr01   1871 NA         1
# … with 102,806 more rows

Compute in `j`

Column

data.table

# data.table
batting_dt[ , .(playerID, avg = H / AB)]

         playerID       avg
     1: abercda01 0.0000000
     2:  addybo01 0.2711864
     3: allisar01 0.2919708
     4: allisdo01 0.3308271
     5: ansonca01 0.3250000
    ---                    
102812: zimmejo02 0.2500000
102813: zimmery01 0.2177986
102814: zobribe01 0.2715105
102815: zuninmi01 0.2073171
102816:  zychto01       NaN

Any valid R expression can be used in “j” and we may reference the columns as variables. This is similar to the behavior of the R function with(). As the long as the expression returns a list the result is a data.table.

Column

SQL

#SQL: 
query = 
'
SELECT playerID, 
       (Cast (H as Float) /  Cast (AB as Float) ) as avg
FROM BATTING
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 102,816 x 2
   playerID    avg
   <chr>     <dbl>
 1 abercda01 0    
 2 addybo01  0.271
 3 allisar01 0.292
 4 allisdo01 0.331
 5 ansonca01 0.325
 6 armstbo01 0.224
 7 barkeal01 0.25 
 8 barnero01 0.401
 9 barrebi01 0.2  
10 barrofr01 0.151
# … with 102,806 more rows

dplyr

# dplyr: transmute
batting_tbl %>%
  transmute( playerID, avg = H / AB)

# A tibble: 102,816 x 2
   playerID    avg
   <chr>     <dbl>
 1 abercda01 0    
 2 addybo01  0.271
 3 allisar01 0.292
 4 allisdo01 0.331
 5 ansonca01 0.325
 6 armstbo01 0.224
 7 barkeal01 0.25 
 8 barnero01 0.401
 9 barrebi01 0.2  
10 barrofr01 0.151
# … with 102,806 more rows

Aggregate in `j`

Column

data.table

# data.table: aggregate using a valid R expression
batting_dt[, .(max_HBP = max(HBP, na.rm=TRUE))]

   max_HBP
1:      51

Here we find the maximum HBP entry in the table using a function that returns a single value.

Column

SQL

# SQL: SELECT with aggregation function
query =
'
SELECT max(HBP) as max_hbp
FROM BATTING
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 1 x 1
  max_hbp
    <int>
1      51

dplyr

# dplyr: summarize
batting_tbl %>%
  summarize( max_HBP = max(HBP, na.rm=TRUE))

# A tibble: 1 x 1
  max_HBP
    <dbl>
1      51

Grouping with `by`

Column

data.table: `by`

# data.table
#batting_dt[ , .(avg = sum(H) / sum(AB)), by = .(playerID, yearID, lgID)]
batting_dt[ , .(avg = sum(H) / sum(AB)), .(playerID, yearID, lgID)]

        playerID yearID lgID       avg
    1: abercda01   1871   NA 0.0000000
    2:  addybo01   1871   NA 0.2711864
    3: allisar01   1871   NA 0.2919708
    4: allisdo01   1871   NA 0.3308271
    5: ansonca01   1871   NA 0.3250000
   ---                                
97776: zimmejo02   2016   AL 0.2500000
97777: zimmery01   2016   NL 0.2177986
97778: zobribe01   2016   NL 0.2715105
97779: zuninmi01   2016   AL 0.2073171
97780:  zychto01   2016   AL       NaN

We can perform grouped operations in j using the by argument. Use a list to form groups using the unique combinations from multiple columns. Observe that unlike the SQL and dplyr versions the resulting table is not re-ordered by group.

data.table: `keyby`

# data.table: keyby
# Here the parameter name is required
batting_dt[ , .(avg = sum(H) / sum(AB)), keyby = .(playerID, yearID, lgID)]

        playerID yearID lgID       avg
    1: aardsda01   2004   NL       NaN
    2: aardsda01   2006   NL 0.0000000
    3: aardsda01   2007   AL       NaN
    4: aardsda01   2008   AL 0.0000000
    5: aardsda01   2009   AL       NaN
   ---                                
97776: zwilldu01   1914   FL 0.3125000
97777: zwilldu01   1915   FL 0.2864964
97778: zwilldu01   1916   NL 0.1132075
97779:  zychto01   2015   AL       NaN
97780:  zychto01   2016   AL       NaN

If we use keyby the new data.table is reordered by group.

Column

SQL

#SQL: 
query = 
'
SELECT playerID, yearID, lgID,
       ( Cast (sum(H) as Float) / Cast (sum(AB) as Float) ) as avg
FROM BATTING
GROUP BY playerID, yearID, lgID
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 97,780 x 4
   playerID  yearID lgID     avg
   <chr>      <int> <chr>  <dbl>
 1 aardsda01   2004 NL    NA    
 2 aardsda01   2006 NL     0    
 3 aardsda01   2007 AL    NA    
 4 aardsda01   2008 AL     0    
 5 aardsda01   2009 AL    NA    
 6 aardsda01   2010 AL    NA    
 7 aardsda01   2012 AL    NA    
 8 aardsda01   2013 NL    NA    
 9 aardsda01   2015 NL     0    
10 aaronha01   1954 NL     0.280
# … with 97,770 more rows

dplyr

# dplyr: group_by
batting_tbl %>% 
  group_by(playerID, yearID, lgID) %>%
  summarize( avg = sum(H) / sum(AB) )

# A tibble: 97,780 x 4
# Groups:   playerID, yearID [95,250]
   playerID  yearID lgID      avg
   <chr>      <int> <chr>   <dbl>
 1 aardsda01   2004 NL    NaN    
 2 aardsda01   2006 NL      0    
 3 aardsda01   2007 AL    NaN    
 4 aardsda01   2008 AL      0    
 5 aardsda01   2009 AL    NaN    
 6 aardsda01   2010 AL    NaN    
 7 aardsda01   2012 AL    NaN    
 8 aardsda01   2013 NL    NaN    
 9 aardsda01   2015 NL      0    
10 aaronha01   1954 NL      0.280
# … with 97,770 more rows

Select rows in `i`

Column

data.table: logical indexing

# data.table
batting_dt[ yearID == 2016, .(playerID, HBP)]

       playerID HBP
   1:  abadfe01   0
   2:  abadfe01   0
   3: abreujo02  15
   4: achteaj01   0
   5: ackledu01   0
  ---              
1479: zimmejo02   0
1480: zimmery01   5
1481: zobribe01   4
1482: zuninmi01   6
1483:  zychto01   0

The i clause in a data.table is used to select rows. Similar to dplyr::filter or subsetting a data.frame we often use it with a logical index whose length matches the number of rows.

data.table: keys

# data.table
setkey(batting_dt, 'teamID')
batting_dt['DET', .(playerID, teamID, HBP)]

       playerID teamID HBP
   1: barreji01    DET   5
   2: buelofr01    DET   3
   3: caseydo01    DET  10
   4: crockda01    DET   2
   5: cronija01    DET   1
  ---                     
4212: verlaju01    DET   0
4213: wilsoal01    DET   0
4214: wilsobo02    DET   0
4215: wilsoju10    DET   0
4216: zimmejo02    DET   0

key(batting_dt)

[1] "teamID"

setkey(batting_dt, 'yearID')
batting_dt[.(2016), .(playerID, yearID, HBP)]

       playerID yearID HBP
   1: ahmedni01   2016   4
   2: barreja01   2016   0
   3: bournmi01   2016   0
   4: brachsi01   2016   0
   5: bradlar01   2016   0
  ---                     
1479: taylomi02   2016   1
1480: treinbl01   2016   0
1481: turnetr01   2016   1
1482: werthja01   2016   4
1483: zimmery01   2016   5

# Compare the difference
batting_dt[2016, .(playerID, yearID, HBP)]

    playerID yearID HBP
1: trottsa01   1884   2

# Remove key for later examples
setkey(batting_dt, NULL)

If our data.table has one or more columns designated as a key, we can subset by value in i. Note the need to pass integer-valued keys within a list to differentiate from simple subsetting by integer.

When we set a key using setkey() the rows are reordered to enable faster subsetting by key. This is similar to “indexing” in SQL. Read more about keys in this vignette.

Column

SQL

#SQL: WHERE
query = 
'
SELECT playerID, HBP
FROM BATTING
WHERE yearID = 2016
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 1,483 x 2
   playerID    HBP
   <chr>     <int>
 1 abadfe01      0
 2 abadfe01      0
 3 abreujo02    15
 4 achteaj01     0
 5 ackledu01     0
 6 adamecr01     4
 7 adamsau01     0
 8 adamsma01     2
 9 adlemti01     0
10 adriaeh01     2
# … with 1,473 more rows

dplyr

# dplyr: filter
batting_tbl %>%
  filter(yearID == 2016) %>%
  select(playerID, HBP)

# A tibble: 1,483 x 2
   playerID    HBP
   <chr>     <int>
 1 abadfe01      0
 2 abadfe01      0
 3 abreujo02    15
 4 achteaj01     0
 5 ackledu01     0
 6 adamecr01     4
 7 adamsau01     0
 8 adamsma01     2
 9 adlemti01     0
10 adriaeh01     2
# … with 1,473 more rows

base R

batting_tbl[ batting_tbl$yearID == 2016, c("playerID", "yearID", "HBP")]

# A tibble: 1,483 x 3
   playerID  yearID   HBP
   <chr>      <int> <int>
 1 abadfe01    2016     0
 2 abadfe01    2016     0
 3 abreujo02   2016    15
 4 achteaj01   2016     0
 5 ackledu01   2016     0
 6 adamecr01   2016     4
 7 adamsau01   2016     0
 8 adamsma01   2016     2
 9 adlemti01   2016     0
10 adriaeh01   2016     2
# … with 1,473 more rows

Chaining

Column

data.table: pipes

# data.table
batting_dt[ yearID > 2000, .(HR = sum(HR)), .(playerID)] %>%
  .[HR > 400]

    playerID  HR
1:  dunnad01 462
2: beltrad01 403
3: ortizda01 521
4: soriaal01 409
5: pujolal01 591
6: rodrial01 507
7: cabremi01 446
8: teixema01 409

Those familiar with dplyr will be familiar with the “pipe” operation %>% that calls the function on the right with the object on the left (usually data) as its first parameter. We can take advantage of the . used by margittr::%>% to use pipes with data.table’s [] function.

data.table: chaining

    playerID  HR
1:  dunnad01 462
2: beltrad01 403
3: ortizda01 521
4: soriaal01 409
5: pujolal01 591
6: rodrial01 507
7: cabremi01 446
8: teixema01 409

You can also simply “chain” the [] statements together.

Column

SQL

#SQL: nested anonymous table, "HAVING"
query = 
'
SELECT *
FROM (
 SELECT playerID, sum(HR) as HR
 FROM BATTING
 WHERE yearID > 2000
 GROUP BY playerID
) 
WHERE HR > 400
'

dplyr

# dplyr: Use %>% to chain
batting_tbl %>% 
  filter(yearID > 2000) %>%
  group_by(playerID) %>%
  summarize( HR = sum(HR) ) %>%
  ## Here's the pipe from above
  filter( HR > 400)

# A tibble: 8 x 2
  playerID     HR
  <chr>     <int>
1 beltrad01   403
2 cabremi01   446
3 dunnad01    462
4 ortizda01   521
5 pujolal01   591
6 rodrial01   507
7 soriaal01   409
8 teixema01   409

Order in `i`

Column

data.table

# data.table
batting_dt[ yearID > 2000, .(HR = sum(HR)), .(playerID)][HR > 400][order(-HR)]

    playerID  HR
1: pujolal01 591
2: ortizda01 521
3: rodrial01 507
4:  dunnad01 462
5: cabremi01 446
6: soriaal01 409
7: teixema01 409
8: beltrad01 403

Use order() in the i clause to sort a data.table.

From `help(data.table::order)`:

“Also x[order(.)] is now optimised internally to use data.table’s fast order. data.table always reorders in”C-locale" (see Details). To sort by session locale, use x[base::order(.)]."

Column

SQL

#SQL: 
query = 
'
SELECT *
FROM (
 SELECT playerID, sum(HR) as HR
 FROM BATTING
 WHERE yearID > 2000
 GROUP BY playerID
) 
WHERE HR > 400
ORDER BY -HR
'
lahman %>% tbl(sql(query))  %>% collect()

# A tibble: 8 x 2
  playerID     HR
  <chr>     <int>
1 pujolal01   591
2 ortizda01   521
3 rodrial01   507
4 dunnad01    462
5 cabremi01   446
6 soriaal01   409
7 teixema01   409
8 beltrad01   403

dplyr

# dplyr: arrange
batting_tbl %>% 
  filter(yearID > 2000) %>%
  group_by(playerID) %>%
  summarize( HR = sum(HR) ) %>%
  ## Here's the pipe from above
  filter( HR > 400) %>%
  arrange( desc(HR) )

# A tibble: 8 x 2
  playerID     HR
  <chr>     <int>
1 pujolal01   591
2 ortizda01   521
3 rodrial01   507
4 dunnad01    462
5 cabremi01   446
6 soriaal01   409
7 teixema01   409
8 beltrad01   403

Using `.N`

Column

data.table: multiple stints

# data.table: stints example
batting_dt[yearID == 2016 & AB > 99, .N, .(playerID)][N>1]

     playerID N
 1: johnske05 2
 2:  kempma01 2
 3:  hillaa01 2
 4: coghlch01 2
 5: bruceja01 2
 6: gomezca01 2
 7: reddijo01 2
 8: lucrojo01 2
 9: nunezed02 2
10: beltrca01 2
11: uptonbj01 2

In SQL we often use COUNT to determine how many records in a group meet some condition. In dplyr we can use n() within a function to count the number of rows by group. In data.table the special symbol .N plays the same role.

Here we use .N to find how many players have more than 100 AB with two or more teams in 2016.

data.table: 20-20

# data.table: 20-20
batting_dt[yearID > 2000 , .(SB = sum(SB), HR = sum(HR)), .(playerID, yearID)] %>%
  .[SB > 19 & HR > 19] %>% .[ , .N, yearID]

    yearID  N
 1:   2001 15
 2:   2002  9
 3:   2003  6
 4:   2004  9
 5:   2005  5
 6:   2006  7
 7:   2007 14
 8:   2008  9
 9:   2009 14
10:   2010  7
11:   2011 12
12:   2012 10
13:   2013  9
14:   2014  5
15:   2015  4
16:   2016  9

Here we use .N to count the number of “20-20” players in each year since 2000.

Column

SQL: stints

#SQL: stints example
query = 
'
SELECT playerID, COUNT(teamID) as N
FROM BATTING
WHERE yearID = 2016 AND AB > 99
GROUP BY playerID
HAVING N > 1
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 11 x 2
   playerID      N
   <chr>     <int>
 1 beltrca01     2
 2 bruceja01     2
 3 coghlch01     2
 4 gomezca01     2
 5 hillaa01      2
 6 johnske05     2
 7 kempma01      2
 8 lucrojo01     2
 9 nunezed02     2
10 reddijo01     2
11 uptonbj01     2

SQL: 20-20

#SQL: 20-20
query = 
'
SELECT yearID, COUNT(playerID) as N
FROM (
 SELECT playerID, yearID, sum(SB) as SB, sum(HR) as HR
 FROM BATTING
 WHERE yearID > 2000
 GROUP BY playerID, yearID
 HAVING SB > 19 AND HR > 19
)
GROUP BY yearID
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 16 x 2
   yearID     N
    <int> <int>
 1   2001    15
 2   2002     8
 3   2003     4
 4   2004     9
 5   2005     5
 6   2006     7
 7   2007    14
 8   2008     9
 9   2009    14
10   2010     7
11   2011    12
12   2012     9
13   2013     9
14   2014     5
15   2015     4
16   2016     8

dplyr: stints

# dplyr: n()
batting_tbl %>% 
  filter(AB > 99 & yearID == 2016) %>%
  group_by(playerID, yearID) %>%
  summarize(N=n()) %>%
  filter(N > 1)

# A tibble: 11 x 3
# Groups:   playerID [11]
   playerID  yearID     N
   <chr>      <int> <int>
 1 beltrca01   2016     2
 2 bruceja01   2016     2
 3 coghlch01   2016     2
 4 gomezca01   2016     2
 5 hillaa01    2016     2
 6 johnske05   2016     2
 7 kempma01    2016     2
 8 lucrojo01   2016     2
 9 nunezed02   2016     2
10 reddijo01   2016     2
11 uptonbj01   2016     2

dplyr: 20-20

# dplyr: n()
batting_tbl %>%
  filter( yearID > 2000) %>%
  group_by(yearID, playerID) %>%
  summarize( SB = sum(SB), HR = sum(HR) ) %>%
  filter( SB > 19 & HR > 19) %>%
  summarize( n = n() )

# A tibble: 16 x 2
   yearID     n
    <int> <int>
 1   2001    15
 2   2002     9
 3   2003     6
 4   2004     9
 5   2005     5
 6   2006     7
 7   2007    14
 8   2008     9
 9   2009    14
10   2010     7
11   2011    12
12   2012    10
13   2013     9
14   2014     5
15   2015     4
16   2016     9

Exercises

Column

data.table: Instructions

Write data.table code to match each of the SQL or dplyr queries to the right.

Find the 2016 average for all playerID with team “DET” and at least 100 AB.
Find the playerID from team “DET” with the highest average in 2016, with a minimum of 100 AB.
Find the playerID who hit the most total HRs between 2001 and 2010.
Find all playerIDs with at least 200 hits in 2016 (across all stints) sorted in descending order.
Find the number of playerIDs with at least 200 hits in each year since 2000.

solution-1

# data.table
batting_dt[ teamID == 'DET' & yearID == 2016 & AB >= 100, 
            .(playerID, avg = H / AB)]

     playerID       avg
 1: avilemi01 0.2095808
 2: cabremi01 0.3159664
 3: casteni01 0.2846715
 4: collity01 0.2352941
 5: iglesjo01 0.2548180
 6: kinslia01 0.2880259
 7: martijd02 0.3065217
 8: martivi01 0.2893309
 9: maybica01 0.3151862
10: mccanja02 0.2209302
11: rominan01 0.2356322
12: saltaja01 0.1707317
13: uptonju01 0.2456140

solution-2

# data.table
batting_dt[ teamID == 'DET' & yearID == 2016 & AB >= 100, 
            .(playerID, avg = H / AB)][avg==max(avg)]

    playerID       avg
1: cabremi01 0.3159664

solution-3

# data.table
batting_dt[yearID %in% 2001:2010, .(HR = sum(HR)), .(playerID)] %>%
 .[which.max(HR)]

    playerID  HR
1: rodrial01 424

solution-4

# data.table
batting_dt[yearID==2016, .(H = sum(H)), .(playerID) ][ H>=200 ][ order(-H) ]

    playerID   H
1: altuvjo01 216
2: bettsmo01 214
3: segurje01 203
4: pedrodu01 201

solution-5

# data.table
batting_dt[yearID > 1999, .(H = sum(H)), .(playerID, yearID) ][ H>=200 ] %>%
   .[ , .N, yearID] %>% .[order(-yearID)]

    yearID N
 1:   2016 4
 2:   2015 2
 3:   2014 2
 4:   2012 2
 5:   2011 5
 6:   2010 2
 7:   2009 4
 8:   2008 3
 9:   2007 8
10:   2006 8
11:   2005 3
12:   2004 8
13:   2003 7
14:   2002 5
15:   2001 6
16:   2000 6

Column

SQL-1

query = 
'
SELECT playerID, ( Cast (H as Float) / Cast (AB as Float) ) as avg
FROM BATTING 
WHERE teamID = "DET" AND yearID = 2016 AND AB > 99
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 13 x 2
   playerID    avg
   <chr>     <dbl>
 1 avilemi01 0.210
 2 cabremi01 0.316
 3 casteni01 0.285
 4 collity01 0.235
 5 iglesjo01 0.255
 6 kinslia01 0.288
 7 martijd02 0.307
 8 martivi01 0.289
 9 maybica01 0.315
10 mccanja02 0.221
11 rominan01 0.236
12 saltaja01 0.171
13 uptonju01 0.246

SQL-2

query = 
'
SELECT playerID, max( Cast (H as Float) / Cast (AB as Float) ) as avg
FROM BATTING 
WHERE teamID = "DET" AND yearID = 2016 AND AB > 99
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 1 x 2
  playerID    avg
  <chr>     <dbl>
1 cabremi01 0.316

SQL-3

query = 
'
SELECT playerID, max(HR)
FROM (
 SELECT playerID, sum(HR) as HR
 FROM BATTING 
 WHERE yearID < 2011 AND yearID > 2000
 GROUP BY playerID
) 
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 1 x 2
  playerID  `max(HR)`
  <chr>         <int>
1 rodrial01       424

SQL-4

query = 
'
SELECT playerID, sum(H) as H
FROM BATTING 
WHERE yearID = 2016
GROUP BY playerID
HAVING H > 199
ORDER BY -H
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 4 x 2
  playerID      H
  <chr>     <int>
1 altuvjo01   216
2 bettsmo01   214
3 segurje01   203
4 pedrodu01   201

SQL-5

query = 
'
SELECT yearID as year, COUNT(playerID) as N
FROM (
 SELECT playerID, yearID, sum(H) as H
 FROM BATTING 
 WHERE yearID > 1999
 GROUP BY playerID, yearID
 HAVING H > 199
) 
GROUP BY yearID
ORDER BY -yearID
'
lahman %>% tbl(sql(query)) %>% collect()

# A tibble: 16 x 2
    year     N
   <int> <int>
 1  2016     4
 2  2015     2
 3  2014     2
 4  2012     2
 5  2011     5
 6  2010     2
 7  2009     4
 8  2008     3
 9  2007     8
10  2006     8
11  2005     3
12  2004     8
13  2003     7
14  2002     5
15  2001     6
16  2000     6

dplyr-1

batting_tbl %>%
  filter(yearID == 2016 & teamID == 'DET' & AB > 100) %>%
  transmute(playerID, avg = H / AB)

# A tibble: 13 x 2
   playerID    avg
   <chr>     <dbl>
 1 avilemi01 0.210
 2 cabremi01 0.316
 3 casteni01 0.285
 4 collity01 0.235
 5 iglesjo01 0.255
 6 kinslia01 0.288
 7 martijd02 0.307
 8 martivi01 0.289
 9 maybica01 0.315
10 mccanja02 0.221
11 rominan01 0.236
12 saltaja01 0.171
13 uptonju01 0.246

dplyr-2

batting_tbl %>%
  filter(yearID == 2016 & teamID == 'DET' & AB > 100) %>%
  transmute(playerID, avg = H / AB) %>%
  filter(avg == max(avg))

# A tibble: 1 x 2
  playerID    avg
  <chr>     <dbl>
1 cabremi01 0.316

dplyr-3

batting_tbl %>%
  filter(yearID %in% 2001:2010) %>%
  group_by(playerID) %>%
  summarize(HR = sum(HR)) %>%
  filter(HR == max(HR))

# A tibble: 1 x 2
  playerID     HR
  <chr>     <int>
1 rodrial01   424

dplyr-4

batting_tbl %>%
  filter(yearID %in% 2016) %>%
  group_by(playerID) %>%
  summarize(H = sum(H)) %>%
  filter(H > 199) %>%
  arrange( desc(H) )

# A tibble: 4 x 2
  playerID      H
  <chr>     <int>
1 altuvjo01   216
2 bettsmo01   214
3 segurje01   203
4 pedrodu01   201

dplyr-5

batting_tbl %>%
  filter(yearID > 1999) %>%
  group_by(yearID, playerID) %>%
  summarize(H = sum(H)) %>% # still grouped by yearIDs
  filter(H > 199) %>%       # or just summarize( n = sum(H > 199))
  summarize( n = n() ) %>%
  arrange( desc(yearID) )

# A tibble: 16 x 2
   yearID     n
    <int> <int>
 1   2016     4
 2   2015     2
 3   2014     2
 4   2012     2
 5   2011     5
 6   2010     2
 7   2009     4
 8   2008     3
 9   2007     8
10   2006     8
11   2005     3
12   2004     8
13   2003     7
14   2002     5
15   2001     6
16   2000     6

Getting Started

Column

Packages and data

Column

Vignettes

Select with j

Column

data.table

Column

SQL

dplyr

Compute in j

Column

data.table

Column

SQL

dplyr

Aggregate in j

Column

data.table

Column

SQL

dplyr

Grouping with by

Column

data.table: by

data.table: keyby

Column

SQL

dplyr

Select rows in i

Column

data.table: logical indexing

data.table: keys

Column

SQL

dplyr

base R

Chaining

Column

data.table: pipes

data.table: chaining

Column

SQL

dplyr

Order in i

Column

data.table

From help(data.table::order):

Column

SQL

dplyr

Using .N

Column

data.table: multiple stints

data.table: 20-20

Column

SQL: stints

SQL: 20-20

dplyr: stints

dplyr: 20-20

Exercises

Column

data.table: Instructions

solution-1

solution-2

solution-3

solution-4

solution-5

Column

SQL-1

SQL-2

SQL-3

SQL-4

SQL-5

dplyr-1

dplyr-2

dplyr-3

dplyr-4

dplyr-5

Select with `j`

Compute in `j`

Aggregate in `j`

Grouping with `by`

data.table: `by`

data.table: `keyby`

Select rows in `i`

Order in `i`

From `help(data.table::order)`:

Using `.N`