# Packages: -------------------------------------------------------------------
library(tidyverse)
library(dbplyr)
library(Lahman)
library(data.table)
#lapply( c('tidyverse', 'RSQLite', 'dbplyr', 'Lahman', 'data.table'),
# install.packages)
# Create a local SQLlite database of the Lahman data: ----------------------
#! If this fails, run: install.packages('RSQLite')
lahman = lahman_sqlite()
# Copy the batting table to memory as a tibble: -------------------------------
batting_tbl = lahman %>%
tbl("BATTING") %>%
collect()
class(batting_tbl)
[1] "tbl_df" "tbl" "data.frame"
# Convert the copy in memory to a data.table: ---------------------------------
batting_dt = as.data.table(batting_tbl)
class(batting_dt)
[1] "data.table" "data.frame"
The following points from the datatable-faq: 1.5, 1.6, 2.1, 2.3, 2.16, 2.17, 2.21, 2.23
j
playerID yearID league stint
1: abercda01 1871 NA 1
2: addybo01 1871 NA 1
3: allisar01 1871 NA 1
4: allisdo01 1871 NA 1
5: ansonca01 1871 NA 1
---
102812: zimmejo02 2016 AL 1
102813: zimmery01 2016 NL 1
102814: zobribe01 2016 NL 1
102815: zuninmi01 2016 AL 1
102816: zychto01 2016 AL 1
The j
clause in a data.table is used to select columns. The .()
is an alias for list()
within a data.table. Each element of the list becomes a column in the resulting data.table. We can provide new names for the columns by naming the list elements.
#SQL: SELECT
query =
'
SELECT playerID, yearID, lgID as league, stint
FROM BATTING
'
lahman %>% tbl(sql(query))
# Source: SQL [?? x 4]
# Database: sqlite 3.22.0
# [/var/folders/t6/m1sbk4dn10181ns0jx_ss0s00000gn/T//RtmpcQyRLQ/lahman.sqlite]
playerID yearID league stint
<chr> <int> <chr> <int>
1 abercda01 1871 NA 1
2 addybo01 1871 NA 1
3 allisar01 1871 NA 1
4 allisdo01 1871 NA 1
5 ansonca01 1871 NA 1
6 armstbo01 1871 NA 1
7 barkeal01 1871 NA 1
8 barnero01 1871 NA 1
9 barrebi01 1871 NA 1
10 barrofr01 1871 NA 1
# … with more rows
# dplyr: select, rename, transmute
batting_tbl %>%
transmute(playerID, yearID, league = lgID, stint)
# A tibble: 102,816 x 4
playerID yearID league stint
<chr> <int> <chr> <int>
1 abercda01 1871 NA 1
2 addybo01 1871 NA 1
3 allisar01 1871 NA 1
4 allisdo01 1871 NA 1
5 ansonca01 1871 NA 1
6 armstbo01 1871 NA 1
7 barkeal01 1871 NA 1
8 barnero01 1871 NA 1
9 barrebi01 1871 NA 1
10 barrofr01 1871 NA 1
# … with 102,806 more rows
# A tibble: 102,816 x 4
playerID yearID league stint
<chr> <int> <chr> <int>
1 abercda01 1871 NA 1
2 addybo01 1871 NA 1
3 allisar01 1871 NA 1
4 allisdo01 1871 NA 1
5 ansonca01 1871 NA 1
6 armstbo01 1871 NA 1
7 barkeal01 1871 NA 1
8 barnero01 1871 NA 1
9 barrebi01 1871 NA 1
10 barrofr01 1871 NA 1
# … with 102,806 more rows
j
playerID avg
1: abercda01 0.0000000
2: addybo01 0.2711864
3: allisar01 0.2919708
4: allisdo01 0.3308271
5: ansonca01 0.3250000
---
102812: zimmejo02 0.2500000
102813: zimmery01 0.2177986
102814: zobribe01 0.2715105
102815: zuninmi01 0.2073171
102816: zychto01 NaN
Any valid R expression can be used in “j” and we may reference the columns as variables. This is similar to the behavior of the R function with()
. As the long as the expression returns a list the result is a data.table.
#SQL:
query =
'
SELECT playerID,
(Cast (H as Float) / Cast (AB as Float) ) as avg
FROM BATTING
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 102,816 x 2
playerID avg
<chr> <dbl>
1 abercda01 0
2 addybo01 0.271
3 allisar01 0.292
4 allisdo01 0.331
5 ansonca01 0.325
6 armstbo01 0.224
7 barkeal01 0.25
8 barnero01 0.401
9 barrebi01 0.2
10 barrofr01 0.151
# … with 102,806 more rows
# A tibble: 102,816 x 2
playerID avg
<chr> <dbl>
1 abercda01 0
2 addybo01 0.271
3 allisar01 0.292
4 allisdo01 0.331
5 ansonca01 0.325
6 armstbo01 0.224
7 barkeal01 0.25
8 barnero01 0.401
9 barrebi01 0.2
10 barrofr01 0.151
# … with 102,806 more rows
j
max_HBP
1: 51
Here we find the maximum HBP entry in the table using a function that returns a single value.
# SQL: SELECT with aggregation function
query =
'
SELECT max(HBP) as max_hbp
FROM BATTING
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 1 x 1
max_hbp
<int>
1 51
# A tibble: 1 x 1
max_HBP
<dbl>
1 51
by
by
# data.table
#batting_dt[ , .(avg = sum(H) / sum(AB)), by = .(playerID, yearID, lgID)]
batting_dt[ , .(avg = sum(H) / sum(AB)), .(playerID, yearID, lgID)]
playerID yearID lgID avg
1: abercda01 1871 NA 0.0000000
2: addybo01 1871 NA 0.2711864
3: allisar01 1871 NA 0.2919708
4: allisdo01 1871 NA 0.3308271
5: ansonca01 1871 NA 0.3250000
---
97776: zimmejo02 2016 AL 0.2500000
97777: zimmery01 2016 NL 0.2177986
97778: zobribe01 2016 NL 0.2715105
97779: zuninmi01 2016 AL 0.2073171
97780: zychto01 2016 AL NaN
We can perform grouped operations in j
using the by
argument. Use a list to form groups using the unique combinations from multiple columns. Observe that unlike the SQL and dplyr versions the resulting table is not re-ordered by group.
keyby
# data.table: keyby
# Here the parameter name is required
batting_dt[ , .(avg = sum(H) / sum(AB)), keyby = .(playerID, yearID, lgID)]
playerID yearID lgID avg
1: aardsda01 2004 NL NaN
2: aardsda01 2006 NL 0.0000000
3: aardsda01 2007 AL NaN
4: aardsda01 2008 AL 0.0000000
5: aardsda01 2009 AL NaN
---
97776: zwilldu01 1914 FL 0.3125000
97777: zwilldu01 1915 FL 0.2864964
97778: zwilldu01 1916 NL 0.1132075
97779: zychto01 2015 AL NaN
97780: zychto01 2016 AL NaN
If we use keyby
the new data.table is reordered by group.
#SQL:
query =
'
SELECT playerID, yearID, lgID,
( Cast (sum(H) as Float) / Cast (sum(AB) as Float) ) as avg
FROM BATTING
GROUP BY playerID, yearID, lgID
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 97,780 x 4
playerID yearID lgID avg
<chr> <int> <chr> <dbl>
1 aardsda01 2004 NL NA
2 aardsda01 2006 NL 0
3 aardsda01 2007 AL NA
4 aardsda01 2008 AL 0
5 aardsda01 2009 AL NA
6 aardsda01 2010 AL NA
7 aardsda01 2012 AL NA
8 aardsda01 2013 NL NA
9 aardsda01 2015 NL 0
10 aaronha01 1954 NL 0.280
# … with 97,770 more rows
# dplyr: group_by
batting_tbl %>%
group_by(playerID, yearID, lgID) %>%
summarize( avg = sum(H) / sum(AB) )
# A tibble: 97,780 x 4
# Groups: playerID, yearID [95,250]
playerID yearID lgID avg
<chr> <int> <chr> <dbl>
1 aardsda01 2004 NL NaN
2 aardsda01 2006 NL 0
3 aardsda01 2007 AL NaN
4 aardsda01 2008 AL 0
5 aardsda01 2009 AL NaN
6 aardsda01 2010 AL NaN
7 aardsda01 2012 AL NaN
8 aardsda01 2013 NL NaN
9 aardsda01 2015 NL 0
10 aaronha01 1954 NL 0.280
# … with 97,770 more rows
i
playerID HBP
1: abadfe01 0
2: abadfe01 0
3: abreujo02 15
4: achteaj01 0
5: ackledu01 0
---
1479: zimmejo02 0
1480: zimmery01 5
1481: zobribe01 4
1482: zuninmi01 6
1483: zychto01 0
The i
clause in a data.table is used to select rows. Similar to dplyr::filter
or subsetting a data.frame we often use it with a logical index whose length matches the number of rows.
playerID teamID HBP
1: barreji01 DET 5
2: buelofr01 DET 3
3: caseydo01 DET 10
4: crockda01 DET 2
5: cronija01 DET 1
---
4212: verlaju01 DET 0
4213: wilsoal01 DET 0
4214: wilsobo02 DET 0
4215: wilsoju10 DET 0
4216: zimmejo02 DET 0
[1] "teamID"
playerID yearID HBP
1: ahmedni01 2016 4
2: barreja01 2016 0
3: bournmi01 2016 0
4: brachsi01 2016 0
5: bradlar01 2016 0
---
1479: taylomi02 2016 1
1480: treinbl01 2016 0
1481: turnetr01 2016 1
1482: werthja01 2016 4
1483: zimmery01 2016 5
playerID yearID HBP
1: trottsa01 1884 2
If our data.table has one or more columns designated as a key, we can subset by value in i
. Note the need to pass integer-valued keys within a list to differentiate from simple subsetting by integer.
When we set a key using setkey()
the rows are reordered to enable faster subsetting by key. This is similar to “indexing” in SQL. Read more about keys in this vignette.
#SQL: WHERE
query =
'
SELECT playerID, HBP
FROM BATTING
WHERE yearID = 2016
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 1,483 x 2
playerID HBP
<chr> <int>
1 abadfe01 0
2 abadfe01 0
3 abreujo02 15
4 achteaj01 0
5 ackledu01 0
6 adamecr01 4
7 adamsau01 0
8 adamsma01 2
9 adlemti01 0
10 adriaeh01 2
# … with 1,473 more rows
# A tibble: 1,483 x 2
playerID HBP
<chr> <int>
1 abadfe01 0
2 abadfe01 0
3 abreujo02 15
4 achteaj01 0
5 ackledu01 0
6 adamecr01 4
7 adamsau01 0
8 adamsma01 2
9 adlemti01 0
10 adriaeh01 2
# … with 1,473 more rows
# A tibble: 1,483 x 3
playerID yearID HBP
<chr> <int> <int>
1 abadfe01 2016 0
2 abadfe01 2016 0
3 abreujo02 2016 15
4 achteaj01 2016 0
5 ackledu01 2016 0
6 adamecr01 2016 4
7 adamsau01 2016 0
8 adamsma01 2016 2
9 adlemti01 2016 0
10 adriaeh01 2016 2
# … with 1,473 more rows
playerID HR
1: dunnad01 462
2: beltrad01 403
3: ortizda01 521
4: soriaal01 409
5: pujolal01 591
6: rodrial01 507
7: cabremi01 446
8: teixema01 409
Those familiar with dplyr
will be familiar with the “pipe” operation %>%
that calls the function on the right with the object on the left (usually data) as its first parameter. We can take advantage of the .
used by margittr::%>%
to use pipes with data.table’s []
function.
playerID HR
1: dunnad01 462
2: beltrad01 403
3: ortizda01 521
4: soriaal01 409
5: pujolal01 591
6: rodrial01 507
7: cabremi01 446
8: teixema01 409
You can also simply “chain” the []
statements together.
# dplyr: Use %>% to chain
batting_tbl %>%
filter(yearID > 2000) %>%
group_by(playerID) %>%
summarize( HR = sum(HR) ) %>%
## Here's the pipe from above
filter( HR > 400)
# A tibble: 8 x 2
playerID HR
<chr> <int>
1 beltrad01 403
2 cabremi01 446
3 dunnad01 462
4 ortizda01 521
5 pujolal01 591
6 rodrial01 507
7 soriaal01 409
8 teixema01 409
i
playerID HR
1: pujolal01 591
2: ortizda01 521
3: rodrial01 507
4: dunnad01 462
5: cabremi01 446
6: soriaal01 409
7: teixema01 409
8: beltrad01 403
Use order()
in the i
clause to sort a data.table.
help(data.table::order)
:“Also x[order(.)] is now optimised internally to use data.table’s fast order. data.table always reorders in”C-locale" (see Details). To sort by session locale, use x[base::order(.)]."
#SQL:
query =
'
SELECT *
FROM (
SELECT playerID, sum(HR) as HR
FROM BATTING
WHERE yearID > 2000
GROUP BY playerID
)
WHERE HR > 400
ORDER BY -HR
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 8 x 2
playerID HR
<chr> <int>
1 pujolal01 591
2 ortizda01 521
3 rodrial01 507
4 dunnad01 462
5 cabremi01 446
6 soriaal01 409
7 teixema01 409
8 beltrad01 403
# dplyr: arrange
batting_tbl %>%
filter(yearID > 2000) %>%
group_by(playerID) %>%
summarize( HR = sum(HR) ) %>%
## Here's the pipe from above
filter( HR > 400) %>%
arrange( desc(HR) )
# A tibble: 8 x 2
playerID HR
<chr> <int>
1 pujolal01 591
2 ortizda01 521
3 rodrial01 507
4 dunnad01 462
5 cabremi01 446
6 soriaal01 409
7 teixema01 409
8 beltrad01 403
.N
playerID N
1: johnske05 2
2: kempma01 2
3: hillaa01 2
4: coghlch01 2
5: bruceja01 2
6: gomezca01 2
7: reddijo01 2
8: lucrojo01 2
9: nunezed02 2
10: beltrca01 2
11: uptonbj01 2
In SQL we often use COUNT to determine how many records in a group meet some condition. In dplyr we can use n()
within a function to count the number of rows by group. In data.table
the special symbol .N
plays the same role.
Here we use .N
to find how many players have more than 100 AB with two or more teams in 2016.
# data.table: 20-20
batting_dt[yearID > 2000 , .(SB = sum(SB), HR = sum(HR)), .(playerID, yearID)] %>%
.[SB > 19 & HR > 19] %>% .[ , .N, yearID]
yearID N
1: 2001 15
2: 2002 9
3: 2003 6
4: 2004 9
5: 2005 5
6: 2006 7
7: 2007 14
8: 2008 9
9: 2009 14
10: 2010 7
11: 2011 12
12: 2012 10
13: 2013 9
14: 2014 5
15: 2015 4
16: 2016 9
Here we use .N
to count the number of “20-20” players in each year since 2000.
#SQL: stints example
query =
'
SELECT playerID, COUNT(teamID) as N
FROM BATTING
WHERE yearID = 2016 AND AB > 99
GROUP BY playerID
HAVING N > 1
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 11 x 2
playerID N
<chr> <int>
1 beltrca01 2
2 bruceja01 2
3 coghlch01 2
4 gomezca01 2
5 hillaa01 2
6 johnske05 2
7 kempma01 2
8 lucrojo01 2
9 nunezed02 2
10 reddijo01 2
11 uptonbj01 2
#SQL: 20-20
query =
'
SELECT yearID, COUNT(playerID) as N
FROM (
SELECT playerID, yearID, sum(SB) as SB, sum(HR) as HR
FROM BATTING
WHERE yearID > 2000
GROUP BY playerID, yearID
HAVING SB > 19 AND HR > 19
)
GROUP BY yearID
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 16 x 2
yearID N
<int> <int>
1 2001 15
2 2002 8
3 2003 4
4 2004 9
5 2005 5
6 2006 7
7 2007 14
8 2008 9
9 2009 14
10 2010 7
11 2011 12
12 2012 9
13 2013 9
14 2014 5
15 2015 4
16 2016 8
# dplyr: n()
batting_tbl %>%
filter(AB > 99 & yearID == 2016) %>%
group_by(playerID, yearID) %>%
summarize(N=n()) %>%
filter(N > 1)
# A tibble: 11 x 3
# Groups: playerID [11]
playerID yearID N
<chr> <int> <int>
1 beltrca01 2016 2
2 bruceja01 2016 2
3 coghlch01 2016 2
4 gomezca01 2016 2
5 hillaa01 2016 2
6 johnske05 2016 2
7 kempma01 2016 2
8 lucrojo01 2016 2
9 nunezed02 2016 2
10 reddijo01 2016 2
11 uptonbj01 2016 2
# dplyr: n()
batting_tbl %>%
filter( yearID > 2000) %>%
group_by(yearID, playerID) %>%
summarize( SB = sum(SB), HR = sum(HR) ) %>%
filter( SB > 19 & HR > 19) %>%
summarize( n = n() )
# A tibble: 16 x 2
yearID n
<int> <int>
1 2001 15
2 2002 9
3 2003 6
4 2004 9
5 2005 5
6 2006 7
7 2007 14
8 2008 9
9 2009 14
10 2010 7
11 2011 12
12 2012 10
13 2013 9
14 2014 5
15 2015 4
16 2016 9
Write data.table code to match each of the SQL or dplyr queries to the right.
Find the 2016 average for all playerID with team “DET” and at least 100 AB.
Find the playerID from team “DET” with the highest average in 2016, with a minimum of 100 AB.
Find the playerID who hit the most total HRs between 2001 and 2010.
Find all playerIDs with at least 200 hits in 2016 (across all stints) sorted in descending order.
Find the number of playerIDs with at least 200 hits in each year since 2000.
playerID avg
1: avilemi01 0.2095808
2: cabremi01 0.3159664
3: casteni01 0.2846715
4: collity01 0.2352941
5: iglesjo01 0.2548180
6: kinslia01 0.2880259
7: martijd02 0.3065217
8: martivi01 0.2893309
9: maybica01 0.3151862
10: mccanja02 0.2209302
11: rominan01 0.2356322
12: saltaja01 0.1707317
13: uptonju01 0.2456140
# data.table
batting_dt[ teamID == 'DET' & yearID == 2016 & AB >= 100,
.(playerID, avg = H / AB)][avg==max(avg)]
playerID avg
1: cabremi01 0.3159664
playerID HR
1: rodrial01 424
playerID H
1: altuvjo01 216
2: bettsmo01 214
3: segurje01 203
4: pedrodu01 201
# data.table
batting_dt[yearID > 1999, .(H = sum(H)), .(playerID, yearID) ][ H>=200 ] %>%
.[ , .N, yearID] %>% .[order(-yearID)]
yearID N
1: 2016 4
2: 2015 2
3: 2014 2
4: 2012 2
5: 2011 5
6: 2010 2
7: 2009 4
8: 2008 3
9: 2007 8
10: 2006 8
11: 2005 3
12: 2004 8
13: 2003 7
14: 2002 5
15: 2001 6
16: 2000 6
query =
'
SELECT playerID, ( Cast (H as Float) / Cast (AB as Float) ) as avg
FROM BATTING
WHERE teamID = "DET" AND yearID = 2016 AND AB > 99
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 13 x 2
playerID avg
<chr> <dbl>
1 avilemi01 0.210
2 cabremi01 0.316
3 casteni01 0.285
4 collity01 0.235
5 iglesjo01 0.255
6 kinslia01 0.288
7 martijd02 0.307
8 martivi01 0.289
9 maybica01 0.315
10 mccanja02 0.221
11 rominan01 0.236
12 saltaja01 0.171
13 uptonju01 0.246
query =
'
SELECT playerID, max( Cast (H as Float) / Cast (AB as Float) ) as avg
FROM BATTING
WHERE teamID = "DET" AND yearID = 2016 AND AB > 99
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 1 x 2
playerID avg
<chr> <dbl>
1 cabremi01 0.316
query =
'
SELECT playerID, max(HR)
FROM (
SELECT playerID, sum(HR) as HR
FROM BATTING
WHERE yearID < 2011 AND yearID > 2000
GROUP BY playerID
)
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 1 x 2
playerID `max(HR)`
<chr> <int>
1 rodrial01 424
query =
'
SELECT playerID, sum(H) as H
FROM BATTING
WHERE yearID = 2016
GROUP BY playerID
HAVING H > 199
ORDER BY -H
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 4 x 2
playerID H
<chr> <int>
1 altuvjo01 216
2 bettsmo01 214
3 segurje01 203
4 pedrodu01 201
query =
'
SELECT yearID as year, COUNT(playerID) as N
FROM (
SELECT playerID, yearID, sum(H) as H
FROM BATTING
WHERE yearID > 1999
GROUP BY playerID, yearID
HAVING H > 199
)
GROUP BY yearID
ORDER BY -yearID
'
lahman %>% tbl(sql(query)) %>% collect()
# A tibble: 16 x 2
year N
<int> <int>
1 2016 4
2 2015 2
3 2014 2
4 2012 2
5 2011 5
6 2010 2
7 2009 4
8 2008 3
9 2007 8
10 2006 8
11 2005 3
12 2004 8
13 2003 7
14 2002 5
15 2001 6
16 2000 6
batting_tbl %>%
filter(yearID == 2016 & teamID == 'DET' & AB > 100) %>%
transmute(playerID, avg = H / AB)
# A tibble: 13 x 2
playerID avg
<chr> <dbl>
1 avilemi01 0.210
2 cabremi01 0.316
3 casteni01 0.285
4 collity01 0.235
5 iglesjo01 0.255
6 kinslia01 0.288
7 martijd02 0.307
8 martivi01 0.289
9 maybica01 0.315
10 mccanja02 0.221
11 rominan01 0.236
12 saltaja01 0.171
13 uptonju01 0.246
batting_tbl %>%
filter(yearID == 2016 & teamID == 'DET' & AB > 100) %>%
transmute(playerID, avg = H / AB) %>%
filter(avg == max(avg))
# A tibble: 1 x 2
playerID avg
<chr> <dbl>
1 cabremi01 0.316
batting_tbl %>%
filter(yearID %in% 2001:2010) %>%
group_by(playerID) %>%
summarize(HR = sum(HR)) %>%
filter(HR == max(HR))
# A tibble: 1 x 2
playerID HR
<chr> <int>
1 rodrial01 424
batting_tbl %>%
filter(yearID %in% 2016) %>%
group_by(playerID) %>%
summarize(H = sum(H)) %>%
filter(H > 199) %>%
arrange( desc(H) )
# A tibble: 4 x 2
playerID H
<chr> <int>
1 altuvjo01 216
2 bettsmo01 214
3 segurje01 203
4 pedrodu01 201
batting_tbl %>%
filter(yearID > 1999) %>%
group_by(yearID, playerID) %>%
summarize(H = sum(H)) %>% # still grouped by yearIDs
filter(H > 199) %>% # or just summarize( n = sum(H > 199))
summarize( n = n() ) %>%
arrange( desc(yearID) )
# A tibble: 16 x 2
yearID n
<int> <int>
1 2016 4
2 2015 2
3 2014 2
4 2012 2
5 2011 5
6 2010 2
7 2009 4
8 2008 3
9 2007 8
10 2006 8
11 2005 3
12 2004 8
13 2003 7
14 2002 5
15 2001 6
16 2000 6