This is an example solution to Problem Set 1 for Stats 506 in Fall 2020.
To build this document, run Rscript -e "rmarkdown::render('./PS1_solution.Rmd')"
at the command line.
To run all scripts in this assignment run bash ./ps1_make.sh
.
In question 1, you were asked to write shell scripts to download and prepare data from the NHANES surveys.
The script for part “a” is ps1_q1_ohxden.sh
. This script first loops over the cohort identifiers and file name suffixes listed in ps1_nhanes_files.txt
and downloads the oral health dentition examination data for the specified years. We then use the included cutnames.sh
utility to identity the requested columns and, after verifying they all appear in the same order, extract the data in these columns and append into a single multi-cohort file.
Click to see the contents of
ps1_nhanes_files.txt
.
2011-2012 G
2013-2014 H
2015-2016 I
2017-2018 J
Click below to see the script
ps1_q1_ohxden.sh
.
#!usr/bin/env bash
# 79: -------------------------------------------------------------------------
# Problem Set 1, Question 1
#
# Download and combine NHANES data on Oral Health
# Dentition Exams and Demographics from the 2011-2012
# to 2017-2018 cycles.
#
# This is the pattern for the urls, the years and final letters are
# listed in `ps1_nhanes_files.txt`:
# > https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/OHXDEN_I.XPT
#
# For the dentition exams, we extract a subset of relevant
# variables.
#
# Author: James Henderson (jbhender@umich.edu)
# Updated: Sep 24, 2020
# 79: -------------------------------------------------------------------------
# input parameters: -----------------------------------------------------------
new_file='nhanes_ohxden.csv'
col_regex='SEQN|OHDDESTS|OHX[0-9]*TC|OHX[0-9]*CTC'
# download and prepare dentition data: ----------------------------------------
## loop over cohorts to download and convert to csv
while read cohort id; do
### Define variables
url=https://wwwn.cdc.gov/Nchs/Nhanes/$cohort/OHXDEN_$id.XPT
xpt_file=OHXDEN_$id.XPT
csv_file=OHXDEN_$id.csv
### Don't redownload if csv file is present
if [ ! -f $csv_file ]; then
### Download data if not present
if [ ! -f $xpt_file ]; then
wget $url
fi
### Convert files to csv using R
read="haven::read_xpt('$xpt_file')"
write="data.table::fwrite($read, file = '$csv_file')"
Rscript -e "$write" # Double quotes allow expansion
fi
done < ps1_nhanes_files.txt
# Extract columns and append files: -------------------------------------------
if [ -f $new_file ]; then
echo File $new_file already exists, move or delete.
else
## get the headers
while read cohort id; do
bash ./cutnames.sh OHXDEN_$id.csv $col_regex |
head -n1 >> $new_file.tmp
done < ps1_nhanes_files.txt
## verify the columns all match in the right order
if [ $(< $new_file.tmp uniq | wc -l ) != 1 ]; then
echo Matching columns are not identical: see $new_file.tmp.
else
echo Columns match, appending ...
## headers for $new_file and cleanup
< $new_file.tmp head -n1 > $new_file
rm $new_file.tmp
## add the data
while read cohort id; do
bash ./cutnames.sh OHXDEN_$id.csv $col_regex |
tail -n+2 >> $new_file
done < ps1_nhanes_files.txt
fi
fi
# 79: -------------------------------------------------------------------------
The script produces an appended csv file with the following numbers of rows and columns.
echo $(wc -l nhanes_ohxden.csv | cut -d' ' -f4) rows including header
echo $(< nhanes_ohxden.csv head -n1 | tr ',', ' ' | wc -w | tr ' ' ' ') columns
35910 rows including header
62 columns
In this part you were asked to modify the script from part “a” to download, extract, and append demographic data for the same set of cohorts. The only changes needed were to the file name and the regular expression used by cutnames.sh
for extracting columns.
Click below to see the script
ps1_q1_demo.sh
.
#!usr/bin/env bash
# 79: -------------------------------------------------------------------------
# Problem Set 1, Question 1b
#
# Download and combine NHANES Demographics data
# from the 2011-2012 to 2017-2018 cycles.
#
# This is the pattern for the urls, the years and final letters are
# listed in `ps1_nhanes_files.txt`:
# > https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DEMO_I.XPT
#
# For the dentition exams, we extract a subset of relevant
# variables.
#
# Author: James Henderson (jbhender@umich.edu)
# Updated: Sep 17, 2020
# 79: -------------------------------------------------------------------------
# input parameters: -----------------------------------------------------------
new_file='nhanes_demo.csv'
wgt_regex='RIDSTATR|SDMVPSU|SDMVSTRA|WTMEC2YR|WTINT2YR'
col_regex="SEQN|RIDAGEYR|RIDRETH3|DMDEDUC2|DMDMARTL|$wgt_regex"
# download and prepare dentition data: ----------------------------------------
## loop over cohorts to download and convert to csv
while read cohort id; do
### Define variables
url=https://wwwn.cdc.gov/Nchs/Nhanes/$cohort/DEMO_$id.XPT
xpt_file=DEMO_$id.XPT
csv_file=DEMO_$id.csv
### Don't redownload if csv file is present
if [ ! -f $csv_file ]; then
### Download data if not present
if [ ! -f $xpt_file ]; then
wget $url
fi
### Convert files to csv using R
read="haven::read_xpt('$xpt_file')"
write="data.table::fwrite($read, file = '$csv_file')"
Rscript -e "$write" # Double quotes allow expansion
fi
done < ps1_nhanes_files.txt
# Extract columns and append files: -------------------------------------------
if [ -f $new_file ]; then
echo File $new_file already exists, move or delete.
else
## get the headers
while read cohort id; do
bash ./cutnames.sh DEMO_$id.csv $col_regex |
head -n1 >> $new_file.tmp
done < ps1_nhanes_files.txt
## verify the columns all match in the right order
if [ $(< $new_file.tmp uniq | wc -l ) != 1 ]; then
echo Matching columns are not identical: see $new_file.tmp.
else
echo Columns match, appending ...
## headers for $new_file and cleanup
< $new_file.tmp head -n1 > $new_file
rm $new_file.tmp
## add the data
while read cohort id; do
bash ./cutnames.sh DEMO_$id.csv $col_regex |
tail -n+2 >> $new_file
done < ps1_nhanes_files.txt
fi
fi
# 79: -------------------------------------------------------------------------
The script produces the file nhanes_demo.csv
which has the following dimensions.
echo $(wc -l nhanes_demo.csv | cut -d' ' -f4) rows including header
echo $(< nhanes_demo.csv head -n1 | tr ',', ' ' | wc -w | tr ' ' ' ') columns
39157 rows including header
10 columns
In question 2 you were asked to write functions for creating, plotting, and computing the area under ROC and PR curves. In the solution, you will see a function .perf_counts()
which counts the true and false positives and negatives associated with each unique value of the predictor yhat
. This function is used by perf_roc()
and perf_pr()
.
ps1_q2.R
.
Click to view
ps1_q2.R
.
# Stats 506, F20
# Problem Set 1
# Solution for Question 2
#
# Author: James Henderson (jbhender@umich.edu)
# Updated: September 26, 2020
# 79: -------------------------------------------------------------------------
# example data: ---------------------------------------------------------------
if ( FALSE ) {
# run for interactive testing but not when sourced
path = './'
file = sprintf('%s/isolet_results.csv', path)
isolet = read.table(file, sep = ',', header = TRUE)
}
# helper function: ------------------------------------------------------------
.perf_counts = function(yhat, y) {
# counts true and false positive and negatives for each unique value of yhat
# Inputs:
# yhat - a numeric predictor for y
# y - binary vector of true values being predicted, the larger value
# is assumed to be the target label for `y >= tau`
# Outputs: a data.frame with columns, y-hat, tp, fp, tn, fn counting the
# number of true positives, false positives, true negatives, and false
# negatives for each unique value of y-hat
# unique values
tab = table(-yhat, y)
# total samples and total positives
n = length(y)
pos = sum(tab[, 2])
# results
tp = unname( cumsum(tab[, 2]) )
fp = unname( cumsum(tab[, 1]) )
fn = pos - tp
tn = n - pos - fp
#stopifnot( all( unique( tp + fp + fn + tn) == n ) )
# returned data.frame
data.frame(yhat = sort(unique(yhat), decreasing = TRUE), tp, fp, tn, fn)
}
#with(isolet, .perf_counts(yhat, y))
# part a, function to compute sens, spec, AUC-ROC, and plot ROC: --------------
perf_roc = function(yhat, y, plot = c('none', 'base', 'ggplot2')) {
# compute area under the ROC curve and optionally produce a plot
# Inputs:
# yhat - a numeric predictor for y
# y - binary vector of true values being predicted, the larger value
# is assumed to be the target label for `y >= tau`
# plot - type of plot, if any, to produce as a side-effect, defaults
# to 'none' for no plot in which case the output list below is
# always returned. When plot = 'base' or 'ggplot2' the list below
#
# Outputs:
# A named list with the elements below, returned invisibly when not plotted.
# detail - a data.frame with columns, `yhat`, `tp`, `fp`, `tn`, `fn`
# counting the number of true positives, false positives,
# true negatives, and false negatives for each unique value
# of `yhat` and the corresponding sensitivity and specificity.
# auc_roc - the area under the ROC curve
#
# make sure y is binary and y and yhat are of equal length
stopifnot( length(unique(y)) == 2 )
stopifnot( length(y) == length(yhat) )
# resolve plotting argument
plot_type = match.arg(plot, c('none', 'base', 'ggplot2'))
# counts
df = .perf_counts(yhat, y)
# sensitivity and specificity
df[['sens']] = with(df, tp / {tp + fn})
df[['spec']] = with(df, tn / {tn + fp})
# area under the ROC curve
auc_roc = with(df,
.5 * sum(diff(1 - spec) * {sens[-1] + sens[-length(sens)]})
)
out = list(detail = df, auc_roc = auc_roc)
# plot the curves when requested
## none, always return
if ( plot_type == 'none' ) {
return(out)
}
## base, create plot
if ( plot_type == 'base' ) {
plot(sens ~ I(1 - spec),
data = df,
type = 'l',
las = 1,
xlab = '1 - specificity',
ylab = 'sensitivity'
)
segments(0, 0, 1, 1, lty = 'dashed', col = 'grey')
legend('bottomright', c(sprintf('AUC ROC = %5.3f', auc_roc)), bty = 'n')
}
## ggplot2
if ( plot_type == 'ggplot2' ) {
requireNamespace('ggplot2', quietly = FALSE)
p = ggplot(df, aes(x = 1 - spec, y = sens)) +
geom_line() +
geom_segment( aes(x = 0, y = 0, xend = 1, yend = 1),
col = 'grey', lty = 'dashed') +
geom_label(data = NULL,
aes(x = 1, y = 0, hjust = 'right',
label = sprintf('AUC ROC = %5.3f', auc_roc)),
fill = 'grey') +
xlab('1 - specificity') +
ylab('sensitivity') +
theme_bw()
print(p)
}
## return invisibly (only when assigned)
invisible(out)
}
#roc = with(isolet, perf_roc(yhat, y))
# part b, function to compute precision, recall, AUC-PR, and plot PRC: --------
perf_pr = function(yhat, y, plot = c('none', 'base', 'ggplot2')) {
# compute area under the precision recall curve and optionally produce a plot
# Inputs:
# yhat - a numeric predictor for y
# y - binary vector of true values being predicted, the larger value
# is assumed to be the target label for `y >= tau`
# plot - type of plot, if any, to produce as a side-effect, defaults
# to 'none' for no plot in which case the output list below is
# always returned. When plot = 'base' or 'ggplot2' the list below
#
# Outputs:
# A named list with the elements below, returned invisibly when not plotted.
# detail - a data.frame with columns, `yhat`, `tp`, `fp`, `tn`, `fn`
# counting the number of true positives, false positives,
# true negatives, and false negatives for each unique value
# of `yhat` and the corresponding precision and recall values.
# auc_pr - the area under the precision-recall curve
# make sure y is binary and y and yhat are of equal length
stopifnot( length(unique(y)) == 2 )
stopifnot( length(y) == length(yhat) )
# resolve plotting argument
plot_type = match.arg(plot, c('none', 'base', 'ggplot2'))
# counts
df = .perf_counts(yhat, y)
# sensitivity and specificity
df[['recall']] = with(df, tp / {tp + fn})
df[['precision']] = with(df, tp / {tp + fp})
# area under the ROC curve
auc_pr = with(df,
.5 * sum(diff(recall) * {precision[-1] + precision[-nrow(df)]})
)
out = list(detail = df, auc_pr = auc_pr)
# plot the curves when requested
## none, always return
if ( plot_type == 'none' ) {
return(out)
}
## base, create plot
if ( plot_type == 'base' ) {
plot(precision ~ recall,
data = df,
type = 'l',
las = 1,
xlab = 'recall',
ylab = 'precision',
ylim = c(0, 1)
)
legend('bottomleft', c(sprintf('AUC-PR = %5.3f', auc_pr)), bty = 'n')
}
## ggplot2
if ( plot_type == 'ggplot2' ) {
requireNamespace('ggplot2', quietly = FALSE)
p = ggplot(df, aes(x = recall, y = precision)) +
geom_line() +
geom_label(data = NULL,
aes(x = 0, y = 0, hjust = 'left',
label = sprintf('AUC-PR = %5.3f', auc_pr)),
fill = 'grey') +
xlab('recall') +
ylab('precision') +
theme_bw()
print(p)
}
## return invisibly (only when assigned)
invisible(out)
}
#pr = with(isolet, perf_pr(yhat, y))
# 79: -------------------------------------------------------------------------
# libraries: ------------------------------------------------------------------
library(tidyverse)
# functions: ------------------------------------------------------------------
source('./ps1_q2.R')
# isolet data: ----------------------------------------------------------------
path = './'
file = sprintf('%s/isolet_results.csv', path)
isolet = read.table(file, sep = ',', header = TRUE)
# areas under the curve: ------------------------------------------------------
auc = with(isolet, perf_roc(yhat, y, 'none'))$auc_roc
pr = with(isolet, perf_pr(yhat, y, 'none'))$auc_pr
For the isolet_results
data with predictions yhat
of whether y
is a vowel (1) or consonant (2), the areas under the curve are:
Use the buttons below to select the desired plot.
cap1 = '**Figure 1 (ggplot2)**. *ROC curve for the isolet data.*'
with(isolet, perf_roc(yhat, y, 'ggplot2'))
cap2 = '**Figure 1 (base).** *ROC curve for the isolet data.*'
with(isolet, perf_roc(yhat, y, 'base'))