suppressPackageStartupMessages({
  library(tidyverse)
})

About

This is an example solution to Problem Set 3 for Stats 506 in Fall 2020.

To build this document, run Rscript -e "rmarkdown::render('./PS3_solution.Rmd')" at the command line or bash ps3_make.sh to build this document after running the scripts which prepare the source data.

Data preparation

The script ps3_prep_data.do merges the demographic and dentition data and creates input data sets demo_ps3.dta and ohxden_ps3.dta used by the solutions that follow.

ps3_prep_data.do

writeLines(c('```stata', readLines("./ps3_prep_data.do"), '```'))
*-----------------------------------------------------------------------------*
* Stats 506, Fall 2020 
* Problem Set 3, data prep
* 
* This script prepares data for various parts of question 3 by:
*   - creating memorable variables names
*   - creating a missingness indicator for an incomplete dentition exam
*   - collapsing levels of education
*
* Author: James Henderson
* Updated: October 24, 2020
*-----------------------------------------------------------------------------*
// 79: ---------------------------------------------------------------------- *

// set up: ------------------------------------------------------------------ *
*cd ~/github/ps506_F20/PS3
version 16.1
log using ps3_prep_data.log, text replace

// demo data prep: ---------------------------------------------------------- *

// demographics
import delimited nhanes_demo.csv, clear

// easier to remember names
local demo_vars seqn riagendr ridageyr ridreth3 dmdeduc2 ridstatr 
local new_vars id gender age race edu exam_status
rename (`demo_vars') (`new_vars')
keep `new_vars'

// labels for race and gender
label define gender 1 "Male" 2 "Female"
label values gender gender

label define race 1 "Mexican American" 2 "Other Hispanic" ///
 3 "Non-Hispanic White" 4 "Non-Hispanic Black" 6 "Non-Hispanic Asian" ///
 7 "Other" 
label values race race

// separate age groups
generate under_20 = 0
replace under_20 = 1 if age < 20

// define college
generate college = 0 // no college unknown
replace college = 1 if edu == 4 | edu == 5
replace college = . if under_20 == 1
drop edu
label define lbl_college 0 "no college or unknown" ///
    1 "some college or college graduate"
label values college lbl_college

// remove those who did not participate in medical exams
keep if exam_status == 2

// dentition exam status: --------------------------------------------------- *

// use a frame to avoid saving a new subset to disk
frame create ohx
frame change ohx
import delimited nhanes_ohxden.csv, clear
rename (seqn ohddests) (id ohx_status)
drop *ctc

// create labels and save for use in Q3
label define tooth_count ///
  1 "Primary tooth present" 2 "Permanent tooth present" 3 "Dental Implant" ///
  4 "Tooth not present" 5 "Root fragment" 9 "could not assess"
label values ohx*tc tooth_count

gsort id
save ohxden_ps3.dta, replace

keep id ohx_status

// frlink / frget are used to link (e.g. merge) frames
frame change default
frlink 1:1 id, frame(ohx id) generate(ohx_link)
frget ohx_status, from(ohx_link)

generate ohx = 1 if ohx_status == 1 // (implicitly) & exam_status == 1
replace ohx = 0 if ohx_status != 1 | ohx_status == .

// save demographic data for ps3: ------------------------------------------- *
gsort id
save demo_ps3.dta, replace

// close log: --------------------------------------------------------------- *
log close

// 79: ---------------------------------------------------------------------- *

ps3_prep_data.log

writeLines(c('```', readLines("./ps3_prep_data.log"), '```'))
-------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /Users/jbhender/github/ps506_F20/PS3/ps3_prep_data.log
  log type:  text
 opened on:  30 Oct 2020, 17:23:40

. 
. // demo data prep: ----------------------------------------------------------
>  *
. 
. // demographics
. import delimited nhanes_demo.csv, clear
(11 vars, 39,156 obs)

. 
. // easier to remember names
. local demo_vars seqn riagendr ridageyr ridreth3 dmdeduc2 ridstatr 

. local new_vars id gender age race edu exam_status

. rename (`demo_vars') (`new_vars')

. keep `new_vars'

. 
. // labels for race and gender
. label define gender 1 "Male" 2 "Female"

. label values gender gender

. 
. label define race 1 "Mexican American" 2 "Other Hispanic" ///
>  3 "Non-Hispanic White" 4 "Non-Hispanic Black" 6 "Non-Hispanic Asian" ///
>  7 "Other" 

. label values race race

. 
. // separate age groups
. generate under_20 = 0

. replace under_20 = 1 if age < 20
(16,539 real changes made)

. 
. // define college
. generate college = 0 // no college unknown

. replace college = 1 if edu == 4 | edu == 5
(12,495 real changes made)

. replace college = . if under_20 == 1
(16,539 real changes made, 16,539 to missing)

. drop edu

. label define lbl_college 0 "no college or unknown" ///
>     1 "some college or college graduate"

. label values college lbl_college

. 
. // remove those who did not participate in medical exams
. keep if exam_status == 2
(1,757 observations deleted)

. 
. // dentition exam status: ---------------------------------------------------
>  *
. 
. // use a frame to avoid saving a new subset to disk
. frame create ohx

. frame change ohx

. import delimited nhanes_ohxden.csv, clear
(62 vars, 35,909 obs)

. rename (seqn ohddests) (id ohx_status)

. drop *ctc

. 
. // create labels and save for use in Q3
. label define tooth_count ///
>   1 "Primary tooth present" 2 "Permanent tooth present" 3 "Dental Implant" //
> /
>   4 "Tooth not present" 5 "Root fragment" 9 "could not assess"

. label values ohx*tc tooth_count

. 
. gsort id

. save ohxden_ps3.dta, replace
file ohxden_ps3.dta saved

. 
. keep id ohx_status

. 
. // frlink / frget are used to link (e.g. merge) frames
. frame change default

. frlink 1:1 id, frame(ohx id) generate(ohx_link)
  (1,490 observations in frame default unmatched)

. frget ohx_status, from(ohx_link)
(1,490 missing values generated)
  (1 variable copied from linked frame)

. 
. generate ohx = 1 if ohx_status == 1 // (implicitly) & exam_status == 1
(3,039 missing values generated)

. replace ohx = 0 if ohx_status != 1 | ohx_status == .
(3,039 real changes made)

. 
. // save demographic data for ps3: -------------------------------------------
>  *
. gsort id

. save demo_ps3.dta, replace
file demo_ps3.dta saved

. 
. // close log: ---------------------------------------------------------------
>  *
. log close
      name:  <unnamed>
       log:  /Users/jbhender/github/ps506_F20/PS3/ps3_prep_data.log
  log type:  text
 closed on:  30 Oct 2020, 17:23:42
-------------------------------------------------------------------------------

Question 1

In this question, we construct a balance table for missing or incomplete dentition exams using the 4 cohort NHANES data from the previous assignment.

The primary work for this question is done in ps3_q1.do. Click below to see its contents or log.

ps3_q1.do

writeLines(c('```stata', readLines("./ps3_q1.do"), '```'))
*-----------------------------------------------------------------------------*
* Stats 506, Fall 2020 
* Problem Set 3, Question 1
* 
* Construct a balance table examining associations with missing or incomplete
* dentition examanitations in the NHANES data. 
* 
*
* Author: James Henderson
* Updated: October 25, 2020
*-----------------------------------------------------------------------------*
// 79: ---------------------------------------------------------------------- *

// set up: ------------------------------------------------------------------ *
*cd ~/github/ps506_F20/PS3
version 16.1
log using ps3_q1.log, text replace

// data prep: --------------------------------------------------------------- *
use demo_ps3.dta, clear
generate all = 1
label define all 1 "Total"
label values all all

// balance table for categorical variables: --------------------------------- *

// categorical variables
local balance_vars0 "all gender race college"
local balance_vars1 "all gender race"

// separate results for each level of under_20
foreach x in 0 1 {

    frame copy default ref 
    frame ref: drop if under_20 != `x'

    // loop over subtables
    foreach var of local balance_vars`x' {
        
        // create frame for summary data
        frame copy ref "`var'"
        
        frame `var' {
            // summarize
            generate all2=1
            collapse (sum) n=all2, by(`var' ohx) 
            reshape wide n, i("`var'") j(ohx)
            generate pct_miss = n0 / (n0 + n1) * 100
            generate pct_obs = n1 / (n0 + n1) * 100

            // chi-squared test 
            frame ref: quietly tabulate ohx `var', chi2
            generate p = r(p)
            generate chi2 = r(chi2)
            replace p = . if _n > 1
            replace chi2 = . if _n > 1
    
            // clean up names and facilitate appending
            *rename `var' level
            decode `var', generate(lev)
            generate level = lev, before(`var')
            drop `var' lev
            generate variable = "`var'", before(level)
            rename (n0 n1) (n_miss n_obs)
        
            // save temporary dta file for appending
            save ".`var'", replace
        }
        frame drop `var'        
    }
    
    // append variable tables into a single table
    frame create balance_table`x'
    frame change balance_table`x'
    use ".all", clear
    
    local append_vars0 "gender race college"
    local append_vars1 "gender race"
    
    foreach var of local append_vars`x' {
        append using ".`var'"
        erase ".`var'"  
    }
    erase ".all"
    
    export delimited "balance_table_`x'.csv", replace
    
    frame change default
    frame drop ref
    frame drop balance_table`x'
}

// balance table for continuous variables: ---------------------------------- *

// continuous age
frame copy default age
frame age {
  collapse (mean) mean=age (p25) lwr=age (p75) upr=age, by(under_20 ohx)
}

// ttest for age under 20
quietly ttest age if under_20 == 1, by(ohx)
frame age {
    generate t = `r(t)'
    generate p = `r(p)'
    list
}

// ttest for age 20+
quietly ttest age if under_20 == 0, by(ohx)
frame age {
    replace t = `r(t)' if under_20 == 0
    replace p = `r(p)' if under_20 == 0
    list
}

// one p or t per age group
frame age {
  replace p = . if mod(_n, 2) != 1
  replace t = . if mod(_n, 2) != 1  
  
  export delimited "balance_table_age.csv", replace
}
frame drop age

// close log: --------------------------------------------------------------- *
log close

// 79: ---------------------------------------------------------------------- *

ps3_q1.log

writeLines(c('```', readLines("./ps3_q1.log"), '```'))
-------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /Users/jbhender/github/ps506_F20/PS3/ps3_q1.log
  log type:  text
 opened on:  30 Oct 2020, 17:23:42

. 
. // data prep: ---------------------------------------------------------------
>  *
. use demo_ps3.dta, clear

. generate all = 1

. label define all 1 "Total"

. label values all all

. 
. // balance table for categorical variables: ---------------------------------
>  *
. 
. // categorical variables
. local balance_vars0 "all gender race college"

. local balance_vars1 "all gender race"

. 
. // separate results for each level of under_20
. foreach x in 0 1 {
  2. 
.         frame copy default ref 
  3.         frame ref: drop if under_20 != `x'
  4. 
.         // loop over subtables
.         foreach var of local balance_vars`x' {
  5.                 
.                 // create frame for summary data
.                 frame copy ref "`var'"
  6.                 
.                 frame `var' {
  7.                         // summarize
.                         generate all2=1
  8.                         collapse (sum) n=all2, by(`var' ohx) 
  9.                         reshape wide n, i("`var'") j(ohx)
 10.                         generate pct_miss = n0 / (n0 + n1) * 100
 11.                         generate pct_obs = n1 / (n0 + n1) * 100
 12. 
.                         // chi-squared test 
.                         frame ref: quietly tabulate ohx `var', chi2
 13.                         generate p = r(p)
 14.                         generate chi2 = r(chi2)
 15.                         replace p = . if _n > 1
 16.                         replace chi2 = . if _n > 1
 17.         
.                         // clean up names and facilitate appending
.                         *rename `var' level
.                         decode `var', generate(lev)
 18.                         generate level = lev, before(`var')
 19.                         drop `var' lev
 20.                         generate variable = "`var'", before(level)
 21.                         rename (n0 n1) (n_miss n_obs)
 22.                 
.                         // save temporary dta file for appending
.                         save ".`var'", replace
 23.                 }
 24.                 frame drop `var'                
 25.         }
 26.         
.         // append variable tables into a single table
.         frame create balance_table`x'
 27.         frame change balance_table`x'
 28.         use ".all", clear
 29.         
.         local append_vars0 "gender race college"
 30.         local append_vars1 "gender race"
 31.         
.         foreach var of local append_vars`x' {
 32.                 append using ".`var'"
 33.                 erase ".`var'"  
 34.         }
 35.         erase ".all"
 36.         
.         export delimited "balance_table_`x'.csv", replace
 37.         
.         frame change default
 38.         frame drop ref
 39.         frame drop balance_table`x'
 40. }
(15,753 observations deleted)
(note: j = 0 1)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                        2   ->       1
Number of variables                   3   ->       3
j variable (2 values)               ohx   ->   (dropped)
xij variables:
                                      n   ->   n0 n1
-----------------------------------------------------------------------------
(1 missing value generated)
(1 missing value generated)
(0 real changes made)
(0 real changes made)
(note: file .all not found)
file .all saved
(note: j = 0 1)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                        4   ->       2
Number of variables                   3   ->       3
j variable (2 values)               ohx   ->   (dropped)
xij variables:
                                      n   ->   n0 n1
-----------------------------------------------------------------------------
(1 real change made, 1 to missing)
(1 real change made, 1 to missing)
(note: file .gender not found)
file .gender saved
(note: j = 0 1)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                       12   ->       6
Number of variables                   3   ->       3
j variable (2 values)               ohx   ->   (dropped)
xij variables:
                                      n   ->   n0 n1
-----------------------------------------------------------------------------
(5 real changes made, 5 to missing)
(5 real changes made, 5 to missing)
(note: file .race not found)
file .race saved
(note: j = 0 1)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                        4   ->       2
Number of variables                   3   ->       3
j variable (2 values)               ohx   ->   (dropped)
xij variables:
                                      n   ->   n0 n1
-----------------------------------------------------------------------------
(1 real change made, 1 to missing)
(1 real change made, 1 to missing)
(note: file .college not found)
file .college saved
(note: variable variable was str3, now str6 to accommodate using data's
       values)
(note: variable level was str5, now str6 to accommodate using data's values)
(note: variable level was str6, now str18 to accommodate using data's values)
(note: variable variable was str6, now str7 to accommodate using data's
       values)
(note: variable level was str18, now str32 to accommodate using data's
       values)
file balance_table_0.csv saved
(21,646 observations deleted)
(note: j = 0 1)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                        2   ->       1
Number of variables                   3   ->       3
j variable (2 values)               ohx   ->   (dropped)
xij variables:
                                      n   ->   n0 n1
-----------------------------------------------------------------------------
(1 missing value generated)
(1 missing value generated)
(0 real changes made)
(0 real changes made)
(note: file .all not found)
file .all saved
(note: j = 0 1)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                        4   ->       2
Number of variables                   3   ->       3
j variable (2 values)               ohx   ->   (dropped)
xij variables:
                                      n   ->   n0 n1
-----------------------------------------------------------------------------
(1 real change made, 1 to missing)
(1 real change made, 1 to missing)
(note: file .gender not found)
file .gender saved
(note: j = 0 1)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                       12   ->       6
Number of variables                   3   ->       3
j variable (2 values)               ohx   ->   (dropped)
xij variables:
                                      n   ->   n0 n1
-----------------------------------------------------------------------------
(5 real changes made, 5 to missing)
(5 real changes made, 5 to missing)
(note: file .race not found)
file .race saved
(note: variable variable was str3, now str6 to accommodate using data's
       values)
(note: variable level was str5, now str6 to accommodate using data's values)
(note: variable level was str6, now str18 to accommodate using data's values)
file balance_table_1.csv saved

. 
. // balance table for continuous variables: ----------------------------------
>  *
. 
. // continuous age
. frame copy default age

. frame age {
.   collapse (mean) mean=age (p25) lwr=age (p75) upr=age, by(under_20 ohx)
. }

. 
. // ttest for age under 20
. quietly ttest age if under_20 == 1, by(ohx)

. frame age {
.         generate t = `r(t)'
.         generate p = `r(p)'
.         list

     +------------------------------------------------------+
     | under_20   ohx      mean   lwr   upr           t   p |
     |------------------------------------------------------|
  1. |        0     0   50.2631    37    63   -57.13964   0 |
  2. |        0     1   49.6465    34    64   -57.13964   0 |
  3. |        1     0   1.53292     0     0   -57.13964   0 |
  4. |        1     1   9.18119     4    14   -57.13964   0 |
     +------------------------------------------------------+
. }

. 
. // ttest for age 20+
. quietly ttest age if under_20 == 0, by(ohx)

. frame age {
.         replace t = `r(t)' if under_20 == 0
(2 real changes made)
.         replace p = `r(p)' if under_20 == 0
(2 real changes made)
.         list

     +-------------------------------------------------------------+
     | under_20   ohx      mean   lwr   upr           t          p |
     |-------------------------------------------------------------|
  1. |        0     0   50.2631    37    63    1.208826   .2267431 |
  2. |        0     1   49.6465    34    64    1.208826   .2267431 |
  3. |        1     0   1.53292     0     0   -57.13964          0 |
  4. |        1     1   9.18119     4    14   -57.13964          0 |
     +-------------------------------------------------------------+
. }

. 
. // one p or t per age group
. frame age {
.   replace p = . if mod(_n, 2) != 1
(2 real changes made, 2 to missing)
.   replace t = . if mod(_n, 2) != 1      
(2 real changes made, 2 to missing)
.   
.   export delimited "balance_table_age.csv", replace
file balance_table_age.csv saved
. }

. frame drop age

. 
. // close log: ---------------------------------------------------------------
>  *
. log close
      name:  <unnamed>
       log:  /Users/jbhender/github/ps506_F20/PS3/ps3_q1.log
  log type:  text
 closed on:  30 Oct 2020, 17:23:42
-------------------------------------------------------------------------------

The script outputs csv files named according to the pattern balance_table_*.csv. These are assembled for presentation using R below.

## formatting functions
pretty_p = function(p, min = 1e-3, fmt = '%5.3f') {
  out = ifelse(p < min, sprintf('p < %s', fmt), sprintf('p = %s', fmt))
  ifelse(p < min, sprintf(out, min), sprintf(out, p))
}
pwc = function(n) {
  format(n, big.mark = ',')
}

## read in balance tables
files = grep("^balance_table", dir(), value = TRUE)
tab_list = lapply(files, read.csv)
names(tab_list) = 
  str_replace_all(files, '.csv', '') %>% str_replace_all('balance_table_', '')


## format categorical variable tab
tab1 = bind_rows(
  mutate(tab_list[['1']], u20 = 1),
  mutate(tab_list[['0']], u20 = 0) 
)

## categorical variables
tab = tab1 %>%
  transmute(
    variable = variable,
    Subgroup = level, 
    `Complete` = sprintf('%s (%4.1f%%)', pwc(n_obs), pct_obs),
    `Incomplete/Missing` = sprintf('%s (%4.1f%%)', pwc(n_miss), pct_miss),
    p = ifelse(is.na(p), '--', pretty_p(p)),
    u20
  )

## format age tab
tab2 = tab_list[["age"]] %>%
  transmute(
    under_20,
    ohx = ifelse(ohx == 1, 'Complete', 'Incomplete/Missing'),
    disp = sprintf('%4.1f (%i, %i)', mean, lwr, upr),
    p = pretty_p(p)
  ) 

tab_age = tab2 %>%
  select(!"p") %>%
  pivot_wider(id_cols = 'under_20', 
              names_from = 'ohx',
              values_from = disp
            ) %>%
  left_join(
    {filter(tab2, !is.na(p)) %>% select(under_20, p)}, 
    by = 'under_20'
  ) %>%
  mutate(
    variable = 'all_age', # named so after all in table
    Subgroup = 'Age, mean (IQR)',
    u20 = under_20
  ) %>%
  select(variable, Subgroup, !"under_20")

## append as a single table and arrange for presentation
tab = bind_rows(tab, tab_age) %>%
  arrange(-u20, variable)

Here is the balance table.

cap1 = paste(
  "**Table 1.** *Association of select demographics with having a complete",
  "dention exam.* Only those who participated in the medical examination are",
  "included. "
)

tab_html = tab %>%
  select(!c("variable", "u20")) %>%
  knitr::kable(format = 'html', caption = cap1, align = 'r') %>%
  kableExtra::kable_styling("striped", full_width = TRUE)

## add grouping to rows

### individual vars
n_vars = tab %>%
    group_by(u20, variable) %>%
    summarize(n = n(), .groups = 'drop') %>%
    arrange(-u20)
n_cum = cumsum(n_vars$n)
grps = which(n_vars$n > 1)

for ( g in grps ) {
  label = sprintf('%s, n (%%)', str_to_title(n_vars$variable[g]))
  tab_html = tab_html %>%
    kableExtra::group_rows(label, n_cum[g - 1] + 1, n_cum[g])
}
### under 20
n_u20 = as.vector(with(tab, table(u20)))
tab_html = tab_html %>%
  kableExtra::group_rows("Age < 20 years", 1, n_u20[2]) %>%
  kableExtra::group_rows("Age $\\ge$  20 years", n_u20[2] + 1,  sum(n_u20))

tab_html
Table 1. Association of select demographics with having a complete dention exam. Only those who participated in the medical examination are included.
Subgroup Complete Incomplete/Missing p
Age < 20 years
Total 13,991 (88.8%) 1,762 (11.2%) --
Age, mean (IQR) 9.2 (4, 14) 1.5 (0, 0) p < 0.001
Gender, n (%)
Male 7,102 (89.0%) 875 (11.0%) p = 0.383
Female 6,889 (88.6%) 887 (11.4%) --
Race, n (%)
Mexican American 2,817 (87.6%) 400 (12.4%) p < 0.001
Other Hispanic 1,495 (88.2%) 200 (11.8%) --
Non-Hispanic White 3,748 (86.9%) 566 (13.1%) --
Non-Hispanic Black 3,520 (90.5%) 369 ( 9.5%) --
Non-Hispanic Asian 1,399 (92.8%) 108 ( 7.2%) --
Other 1,012 (89.5%) 119 (10.5%) --
Age \(\ge\) 20 years
Total 20,369 (94.1%) 1,277 ( 5.9%) --
Age, mean (IQR) 49.6 (34, 64) 50.3 (37, 63) p = 0.227
College, n (%)
no college or unknown 8,983 (93.1%) 665 ( 6.9%) p < 0.001
some college or college graduate 11,386 (94.9%) 612 ( 5.1%) --
Gender, n (%)
Male 9,916 (94.9%) 538 ( 5.1%) p < 0.001
Female 10,453 (93.4%) 739 ( 6.6%) --
Race, n (%)
Mexican American 2,722 (93.3%) 197 ( 6.7%) p < 0.001
Other Hispanic 2,086 (92.7%) 165 ( 7.3%) --
Non-Hispanic White 7,585 (95.5%) 359 ( 4.5%) --
Non-Hispanic Black 4,670 (94.3%) 283 ( 5.7%) --
Non-Hispanic Asian 2,558 (91.4%) 241 ( 8.6%) --
Other 748 (95.9%) 32 ( 4.1%) --

Question 2

In this question, we use logistic regression to model the probability of a complete dentition exam as a function of the demographic variables.

The primary work for this question is done in ps3_q2.do. Click below to see its contents or log.

ps3_q2.do

writeLines(c('```stata', readLines("./ps3_q2.do"), '```'))
*-----------------------------------------------------------------------------*
* Stats 506, Fall 2020 
* Problem Set 3, Question 2
* 
* Model the probability of an incomplete or missing dentition exam in the
* NHANES data. 
*
* Author: James Henderson
* Updated: October 25, 2020
*-----------------------------------------------------------------------------*
// 79: ---------------------------------------------------------------------- *

// set up: ------------------------------------------------------------------ *
cd ~/github/ps506_F20/PS3
version 16.1
log using ps3_q2.log, text replace

// data prep: --------------------------------------------------------------- *
use demo_ps3.dta, clear

// center and re-scale age
quietly: summarize age
generate age_c = (age - `r(mean)') / 10

quietly: summarize age if under_20 == 0
generate age_c0 = (age - `r(mean)') / 10 * (1 - under_20)

quietly: summarize age if under_20 == 1
generate age_c1 = (age - `r(mean)') / 10 * under_20

replace college = 0 if under_20 == 1

// note, all missing for age == 0, drop these: ------------------------------ *
drop if age == 0

// models for missing: ------------------------------------------------------ *

// write results to xlsx file
putexcel set ps3_q2.xlsx, replace sheet("AIC")
putexcel A1 = "model"
putexcel B1 = "description"
putexcel C1 = "AIC"

// mod1: 
logistic ohx i.under_20 i.gender c.age_c##c.age_c i.college
putexcel A2 = "`e(cmdline)'"
putexcel B2 = "base model (m1)"
putexcel C2 = (-2 * `e(ll)' + 2 * `e(rank)')
estat ic // for the log, not really needed when run in batch mode

// store these results until a better model comes along
estimates store best_model
local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
local top_model = "mod1"

// mod2: 
logistic ohx i.under_20 i.gender c.age_c##c.age_c c.age_c##i.college
putexcel A3 = "`e(cmdline)'"
putexcel B3 = "m1 + age/college interaction"
putexcel C3 = (-2 * `e(ll)' + 2 * `e(rank)')
estat ic 

// is this model better by AIC? 
if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
    estimate store best_model
    local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
    local top_model = "mod2"
}

// mod3: 
logistic ohx i.under_20 i.gender c.age_c##c.age_c c.age_c#i.gender i.college
putexcel A4 = "`e(cmdline)'"
putexcel B4 = "m1 + age/gender interaction"
putexcel C4 = (-2 * `e(ll)' + 2 * `e(rank)')
estat ic

// is this model better by AIC? 
if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
    estimate store best_model
    local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
    local top_model = "mod3"
}

// mod4: 
logistic ohx i.under_20 i.gender c.age_c##c.age_c i.college##i.gender
putexcel A5 = "`e(cmdline)'"
putexcel B5 = "m1 + gender/college interaction"
putexcel C5 = (-2 * `e(ll)' + 2 * `e(rank)')
estat ic

// is this model better by AIC? 
if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
    estimate store best_model
    local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
    local top_model = "mod4"
}

// mod5: 
logistic ohx i.under_20 i.gender i.college c.age_c##c.age_c#i.under_20
putexcel A6 = "`e(cmdline)'"
putexcel B6 = "m1 + 'under 20'/age(both linear and quadratic) interaction"
putexcel C6 = (-2 * `e(ll)' + 2 * `e(rank)')
estat ic

// is this model better by AIC? 
if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
    estimate store best_model
    local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
    local top_model = "mod5"
}


// mod6:
logistic ohx i.under_20#i.gender i.college c.age_c##c.age_c ///
    c.age_c#c.age_c#c.age_c
putexcel A7 = "`e(cmdline)'"
putexcel B7 = "m1 + cubic age"
putexcel C7 = (-2 * `e(ll)' + 2 * `e(rank)')
estat ic

// is this model better by AIC? 
if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
    estimate store best_model
    local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
    local top_model = "mod6"
}

// format and save best model: ---------------------------------------------- *
estimates restore best_model
display "`top_model'"
 
// collect results
mata:
 b = st_matrix("e(b)")
 v = diagonal(st_matrix("e(V)"))
 se = sqrt(v)
 lwr = b' - 1.96 * se
 upr = b' + 1.96 * se
 coef = (b', se, lwr, upr) 
 st_matrix("coef_tab", coef)
 st_matrixrowstripe("coef_tab", st_matrixcolstripe("e(b)"))
end

// write to Excel
putexcel set ps3_q2.xlsx, modify sheet("Top Model")
putexcel A1 = "Top Model"
putexcel A2 = "`top_model'"
putexcel B1 = "response"
putexcel C1 = "Term"
putexcel D1 = "est"
putexcel E1 = "se"
putexcel F1 = "lwr"
putexcel G1 = "upr"
putexcel B2 = matrix(coef_tab), rownames

// An alternative built in way to output results. 
putexcel set ps3_q2.xlsx, modify sheet("Top Model etable")
estimates replay best_model
putexcel A1 = etable

// close log: --------------------------------------------------------------- *
log close

// 79: ---------------------------------------------------------------------- *

ps3_q2.log

writeLines(c('```', readLines("./ps3_q2.log"), '```'))
-------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /Users/jbhender/github/ps506_F20/PS3/ps3_q2.log
  log type:  text
 opened on:  30 Oct 2020, 17:23:42

. 
. // data prep: ---------------------------------------------------------------
>  *
. use demo_ps3.dta, clear

. 
. // center and re-scale age
. quietly: summarize age

. generate age_c = (age - `r(mean)') / 10

. 
. quietly: summarize age if under_20 == 0

. generate age_c0 = (age - `r(mean)') / 10 * (1 - under_20)

. 
. quietly: summarize age if under_20 == 1

. generate age_c1 = (age - `r(mean)') / 10 * under_20

. 
. replace college = 0 if under_20 == 1
(15,753 real changes made)

. 
. // note, all missing for age == 0, drop these: ------------------------------
>  *
. drop if age == 0
(1,490 observations deleted)

. 
. // models for missing: ------------------------------------------------------
>  *
. 
. // write results to xlsx file
. putexcel set ps3_q2.xlsx, replace sheet("AIC")
Note: file will be replaced when the first putexcel command is issued

. putexcel A1 = "model"
file ps3_q2.xlsx saved

. putexcel B1 = "description"
file ps3_q2.xlsx saved

. putexcel C1 = "AIC"
file ps3_q2.xlsx saved

. 
. // mod1: 
. logistic ohx i.under_20 i.gender c.age_c##c.age_c i.college

Logistic regression                             Number of obs     =     35,909
                                                LR chi2(5)        =     443.54
                                                Prob > chi2       =     0.0000
Log likelihood = -6162.4232                     Pseudo R2         =     0.0347

------------------------------------------------------------------------------
         ohx | Odds Ratio   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
  1.under_20 |   2.168888   .3206005     5.24   0.000     1.623358    2.897744
             |
      gender |
     Female  |   .7897955   .0416511    -4.47   0.000     .7122381    .8757984
       age_c |   .8633734   .0287313    -4.41   0.000     .8088583    .9215627
             |
     c.age_c#|
     c.age_c |   1.038839   .0086063     4.60   0.000     1.022107    1.055844
             |
     college |
some coll..  |   1.395435   .0812832     5.72   0.000     1.244881    1.564198
       _cons |   15.65632   .9326263    46.18   0.000     13.93108    17.59522
------------------------------------------------------------------------------
Note: _cons estimates baseline odds.

. putexcel A2 = "`e(cmdline)'"
file ps3_q2.xlsx saved

. putexcel B2 = "base model (m1)"
file ps3_q2.xlsx saved

. putexcel C2 = (-2 * `e(ll)' + 2 * `e(rank)')
file ps3_q2.xlsx saved

. estat ic // for the log, not really needed when run in batch mode

Akaike's information criterion and Bayesian information criterion

-----------------------------------------------------------------------------
       Model |          N   ll(null)  ll(model)      df        AIC        BIC
-------------+---------------------------------------------------------------
           . |     35,909  -6384.193  -6162.423       6   12336.85   12387.78
-----------------------------------------------------------------------------
Note: BIC uses N = number of observations. See [R] BIC note.

. 
. // store these results until a better model comes along
. estimates store best_model

. local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')

. local top_model = "mod1"

. 
. // mod2: 
. logistic ohx i.under_20 i.gender c.age_c##c.age_c c.age_c##i.college
note: age_c omitted because of collinearity

Logistic regression                             Number of obs     =     35,909
                                                LR chi2(6)        =     445.61
                                                Prob > chi2       =     0.0000
Log likelihood = -6161.3883                     Pseudo R2         =     0.0349

------------------------------------------------------------------------------
         ohx | Odds Ratio   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
  1.under_20 |   1.969615   .3213247     4.15   0.000     1.430591    2.711734
             |
      gender |
     Female  |   .7921543   .0418083    -4.41   0.000     .7143073    .8784852
       age_c |   .8438539   .0312812    -4.58   0.000     .7847182     .907446
             |
     c.age_c#|
     c.age_c |   1.038892   .0086055     4.61   0.000     1.022161    1.055896
             |
       age_c |          1  (omitted)
             |
     college |
some coll..  |   1.279037   .1075942     2.93   0.003     1.084623    1.508299
             |
     college#|
     c.age_c |
some coll..  |   1.050421   .0359593     1.44   0.151     .9822548    1.123319
             |
       _cons |   16.35849   1.106994    41.30   0.000     14.32655    18.67862
------------------------------------------------------------------------------
Note: _cons estimates baseline odds.

. putexcel A3 = "`e(cmdline)'"
file ps3_q2.xlsx saved

. putexcel B3 = "m1 + age/college interaction"
file ps3_q2.xlsx saved

. putexcel C3 = (-2 * `e(ll)' + 2 * `e(rank)')
file ps3_q2.xlsx saved

. estat ic 

Akaike's information criterion and Bayesian information criterion

-----------------------------------------------------------------------------
       Model |          N   ll(null)  ll(model)      df        AIC        BIC
-------------+---------------------------------------------------------------
           . |     35,909  -6384.193  -6161.388       7   12336.78    12396.2
-----------------------------------------------------------------------------
Note: BIC uses N = number of observations. See [R] BIC note.

. 
. // is this model better by AIC? 
. if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
.         estimate store best_model
.         local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
.         local top_model = "mod2"
. }

. 
. // mod3: 
. logistic ohx i.under_20 i.gender c.age_c##c.age_c c.age_c#i.gender i.college

Logistic regression                             Number of obs     =     35,909
                                                LR chi2(6)        =     444.81
                                                Prob > chi2       =     0.0000
Log likelihood = -6161.7888                     Pseudo R2         =     0.0348

------------------------------------------------------------------------------
         ohx | Odds Ratio   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
  1.under_20 |   2.166338   .3201651     5.23   0.000     1.621535    2.894183
             |
      gender |
     Female  |   .8126902   .0475282    -3.55   0.000     .7246772    .9113925
       age_c |   .8760372   .0312277    -3.71   0.000     .8169212    .9394311
             |
     c.age_c#|
     c.age_c |   1.039045   .0086115     4.62   0.000     1.022303    1.056061
             |
      gender#|
     c.age_c |
     Female  |   .9732633    .023425    -1.13   0.260     .9284173    1.020276
             |
     college |
some coll..  |   1.394066   .0812181     5.70   0.000     1.243633    1.562695
       _cons |   15.41867    .939054    44.92   0.000     13.68376    17.37354
------------------------------------------------------------------------------
Note: _cons estimates baseline odds.

. putexcel A4 = "`e(cmdline)'"
file ps3_q2.xlsx saved

. putexcel B4 = "m1 + age/gender interaction"
file ps3_q2.xlsx saved

. putexcel C4 = (-2 * `e(ll)' + 2 * `e(rank)')
file ps3_q2.xlsx saved

. estat ic

Akaike's information criterion and Bayesian information criterion

-----------------------------------------------------------------------------
       Model |          N   ll(null)  ll(model)      df        AIC        BIC
-------------+---------------------------------------------------------------
           . |     35,909  -6384.193  -6161.789       7   12337.58      12397
-----------------------------------------------------------------------------
Note: BIC uses N = number of observations. See [R] BIC note.

. 
. // is this model better by AIC? 
. if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
.         estimate store best_model
.         local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
.         local top_model = "mod3"
. }

. 
. // mod4: 
. logistic ohx i.under_20 i.gender c.age_c##c.age_c i.college##i.gender

Logistic regression                             Number of obs     =     35,909
                                                LR chi2(6)        =     444.62
                                                Prob > chi2       =     0.0000
Log likelihood = -6161.8815                     Pseudo R2         =     0.0348

------------------------------------------------------------------------------
         ohx | Odds Ratio   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
  1.under_20 |   2.165201    .320116     5.23   0.000     1.620509    2.892976
             |
      gender |
     Female  |   .8251306   .0556223    -2.85   0.004     .7230077    .9416782
       age_c |   .8631518   .0287278    -4.42   0.000     .8086435    .9213344
             |
     c.age_c#|
     c.age_c |   1.038703   .0086063     4.58   0.000     1.021971    1.055708
             |
     college |
some coll..  |   1.488576   .1272183     4.65   0.000     1.258997    1.760019
             |
     college#|
      gender |
some coll.. #|
     Female  |   .8932307   .0970066    -1.04   0.298     .7219739    1.105111
             |
       _cons |   15.31522   .9636251    43.37   0.000     13.53836    17.32528
------------------------------------------------------------------------------
Note: _cons estimates baseline odds.

. putexcel A5 = "`e(cmdline)'"
file ps3_q2.xlsx saved

. putexcel B5 = "m1 + gender/college interaction"
file ps3_q2.xlsx saved

. putexcel C5 = (-2 * `e(ll)' + 2 * `e(rank)')
file ps3_q2.xlsx saved

. estat ic

Akaike's information criterion and Bayesian information criterion

-----------------------------------------------------------------------------
       Model |          N   ll(null)  ll(model)      df        AIC        BIC
-------------+---------------------------------------------------------------
           . |     35,909  -6384.193  -6161.882       7   12337.76   12397.18
-----------------------------------------------------------------------------
Note: BIC uses N = number of observations. See [R] BIC note.

. 
. // is this model better by AIC? 
. if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
.         estimate store best_model
.         local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
.         local top_model = "mod4"
. }

. 
. // mod5: 
. logistic ohx i.under_20 i.gender i.college c.age_c##c.age_c#i.under_20

Logistic regression                             Number of obs     =     35,909
                                                LR chi2(7)        =     444.08
                                                Prob > chi2       =     0.0000
Log likelihood = -6162.1523                     Pseudo R2         =     0.0348

------------------------------------------------------------------------------
         ohx | Odds Ratio   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
  1.under_20 |    2.76122    3.04683     0.92   0.357     .3175811    24.00753
             |
      gender |
     Female  |   .7898223   .0416546    -4.47   0.000     .7122585    .8758326
             |
     college |
some coll..  |   1.395286   .0812779     5.72   0.000     1.244742    1.564038
             |
    under_20#|
     c.age_c |
          0  |   .8484016   .0349928    -3.99   0.000      .782516    .9198346
          1  |   1.013708   1.039576     0.01   0.989     .1358284     7.56546
             |
    under_20#|
     c.age_c#|
     c.age_c |
          0  |   1.043404   .0107374     4.13   0.000     1.022569    1.064662
          1  |   1.061035   .2421755     0.26   0.795     .6783399    1.659632
             |
       _cons |   15.73226   .9451997    45.87   0.000     13.98462     17.6983
------------------------------------------------------------------------------
Note: _cons estimates baseline odds.

. putexcel A6 = "`e(cmdline)'"
file ps3_q2.xlsx saved

. putexcel B6 = "m1 + 'under 20'/age(both linear and quadratic) interaction"
file ps3_q2.xlsx saved

. putexcel C6 = (-2 * `e(ll)' + 2 * `e(rank)')
file ps3_q2.xlsx saved

. estat ic

Akaike's information criterion and Bayesian information criterion

-----------------------------------------------------------------------------
       Model |          N   ll(null)  ll(model)      df        AIC        BIC
-------------+---------------------------------------------------------------
           . |     35,909  -6384.193  -6162.152       8    12340.3   12408.21
-----------------------------------------------------------------------------
Note: BIC uses N = number of observations. See [R] BIC note.

. 
. // is this model better by AIC? 
. if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
.         estimate store best_model
.         local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
.         local top_model = "mod5"
. }

. 
. 
. // mod6:
. logistic ohx i.under_20#i.gender i.college c.age_c##c.age_c ///
>         c.age_c#c.age_c#c.age_c

Logistic regression                             Number of obs     =     35,909
                                                LR chi2(7)        =     446.83
                                                Prob > chi2       =     0.0000
Log likelihood = -6160.7759                     Pseudo R2         =     0.0350

------------------------------------------------------------------------------
         ohx | Odds Ratio   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
    under_20#|
      gender |
   0#Female  |   .7558556    .044239    -4.78   0.000     .6739371    .8477314
     1#Male  |   1.863768   .3194154     3.63   0.000     1.332023    2.607787
   1#Female  |   1.788205   .3045442     3.41   0.001     1.280712    2.496796
             |
     college |
some coll..  |   1.399161   .0815504     5.76   0.000     1.248116    1.568484
       age_c |   .8665719   .0296992    -4.18   0.000     .8102744    .9267809
             |
     c.age_c#|
     c.age_c |   1.043855   .0137724     3.25   0.001     1.017207      1.0712
             |
     c.age_c#|
     c.age_c#|
     c.age_c |   .9986963   .0027936    -0.47   0.641     .9932359    1.004187
             |
       _cons |   15.90378   1.015895    43.31   0.000     14.03227    18.02491
------------------------------------------------------------------------------
Note: _cons estimates baseline odds.

. putexcel A7 = "`e(cmdline)'"
file ps3_q2.xlsx saved

. putexcel B7 = "m1 + cubic age"
file ps3_q2.xlsx saved

. putexcel C7 = (-2 * `e(ll)' + 2 * `e(rank)')
file ps3_q2.xlsx saved

. estat ic

Akaike's information criterion and Bayesian information criterion

-----------------------------------------------------------------------------
       Model |          N   ll(null)  ll(model)      df        AIC        BIC
-------------+---------------------------------------------------------------
           . |     35,909  -6384.193  -6160.776       8   12337.55   12405.46
-----------------------------------------------------------------------------
Note: BIC uses N = number of observations. See [R] BIC note.

. 
. // is this model better by AIC? 
. if `best_aic' > (-2 * `e(ll)' + 2 * `e(rank)') {
.         estimate store best_model
.         local best_aic = (-2 * `e(ll)' + 2 * `e(rank)')
.         local top_model = "mod6"
. }

. 
. // format and save best model: ----------------------------------------------
>  *
. estimates restore best_model
(results best_model are active now)

. display "`top_model'"
mod2

.  
. // collect results
. mata:
------------------------------------------------- mata (type end to exit) -----
:  b = st_matrix("e(b)")

:  v = diagonal(st_matrix("e(V)"))

:  se = sqrt(v)

:  lwr = b' - 1.96 * se

:  upr = b' + 1.96 * se

:  coef = (b', se, lwr, upr) 

:  st_matrix("coef_tab", coef)

:  st_matrixrowstripe("coef_tab", st_matrixcolstripe("e(b)"))

: end
-------------------------------------------------------------------------------

. 
. // write to Excel
. putexcel set ps3_q2.xlsx, modify sheet("Top Model")

. putexcel A1 = "Top Model"
file ps3_q2.xlsx saved

. putexcel A2 = "`top_model'"
file ps3_q2.xlsx saved

. putexcel B1 = "response"
file ps3_q2.xlsx saved

. putexcel C1 = "Term"
file ps3_q2.xlsx saved

. putexcel D1 = "est"
file ps3_q2.xlsx saved

. putexcel E1 = "se"
file ps3_q2.xlsx saved

. putexcel F1 = "lwr"
file ps3_q2.xlsx saved

. putexcel G1 = "upr"
file ps3_q2.xlsx saved

. putexcel B2 = matrix(coef_tab), rownames
file ps3_q2.xlsx saved

. 
. // An alternative built in way to output results. 
. putexcel set ps3_q2.xlsx, modify sheet("Top Model etable")

. estimates replay best_model

-------------------------------------------------------------------------------
Model best_model
-------------------------------------------------------------------------------

note: age_c omitted because of collinearity

Logistic regression                             Number of obs     =     35,909
                                                LR chi2(6)        =     445.61
                                                Prob > chi2       =     0.0000
Log likelihood = -6161.3883                     Pseudo R2         =     0.0349

------------------------------------------------------------------------------
         ohx | Odds Ratio   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
  1.under_20 |   1.969615   .3213247     4.15   0.000     1.430591    2.711734
             |
      gender |
     Female  |   .7921543   .0418083    -4.41   0.000     .7143073    .8784852
       age_c |   .8438539   .0312812    -4.58   0.000     .7847182     .907446
             |
     c.age_c#|
     c.age_c |   1.038892   .0086055     4.61   0.000     1.022161    1.055896
             |
       age_c |          1  (omitted)
             |
     college |
some coll..  |   1.279037   .1075942     2.93   0.003     1.084623    1.508299
             |
     college#|
     c.age_c |
some coll..  |   1.050421   .0359593     1.44   0.151     .9822548    1.123319
             |
       _cons |   16.35849   1.106994    41.30   0.000     14.32655    18.67862
------------------------------------------------------------------------------
Note: _cons estimates baseline odds.

. putexcel A1 = etable
file ps3_q2.xlsx saved

. 
. // close log: ---------------------------------------------------------------
>  *
. log close
      name:  <unnamed>
       log:  /Users/jbhender/github/ps506_F20/PS3/ps3_q2.log
  log type:  text
 closed on:  30 Oct 2020, 17:23:47
-------------------------------------------------------------------------------

These scripts outputs the summary file ps3_q2.xlsx with tabs: * AIC - the models considered and their AIC's, * Top Model - regression output for the top model, * Top Model etable - alternative regression output.

The final table is included here primarily as a demonstration.

In the regression models, please note the following: - as Table 1 suggests, there are no dentition exams for those under age 1; these participants have been excluded as they are missing structurally. - to make the corresponding odds ratios easier to interpret, age has been centered and scaled to 10-year units.

The code below reads and formats these results for display.

## AIC
aic_tab = readxl::read_xlsx('./ps3_q2.xlsx', sheet = 'AIC')
aic_tab = aic_tab %>%
  rename( `Stata Command` = model ) %>%
  mutate( model = paste0('m', 1:n()), delta = AIC - min(AIC)) %>%
  select(model, everything())

## regression table
term_map = c(
  "1.under_20" = "Under 20", 
  "2.gender" = "Female",
  "age_c" = "Age in decades, (centered)",
  "c.age_c#c.age_c" = "Age squared",
  "1.college" = "Some college or College graduate",
  "1.college#c.age_c" = "Age x College",
  "_cons" = "Reference Probability (Age 32/Male/No college)"
)
reg_tab = readxl::read_xlsx('./ps3_q2.xlsx', sheet = 'Top Model')
reg_tab = reg_tab %>%
  filter( !{grepl('b[.]', Term) | grepl('^o[.]', Term)}) %>%
  transmute(
    Covariate = term_map[Term],
    `OR (95% CI)` = 
      ifelse(Term == '_cons', 
       sprintf('%4.2f (%4.2f, %4.2f)', plogis(est), plogis(lwr), plogis(upr)),
       sprintf('%4.2f (%4.2f, %4.2f)', exp(est), exp(lwr), exp(upr))
      )
  )
cap2 = paste(
  "**Table 2.** *Odds Ratios for a Complete Dentition Exam.*",
  "Overall the probability of a complete dentition exam is high. Those",
  "under 20, males, and college attendance were associated with higher",
  "probablity of a complete exam. Increasing age is associated with a lower",
  "probability of a complete exam for those under ~60 (see marginal effects",
  "in the log, not included here.)"
)
knitr::kable(reg_tab, caption = cap2) %>%
  kableExtra::kable_styling("striped")
Table 2. Odds Ratios for a Complete Dentition Exam. Overall the probability of a complete dentition exam is high. Those under 20, males, and college attendance were associated with higher probablity of a complete exam. Increasing age is associated with a lower probability of a complete exam for those under ~60 (see marginal effects in the log, not included here.)
Covariate OR (95% CI)
Under 20 1.97 (1.43, 2.71)
Female 0.79 (0.71, 0.88)
Age in decades, (centered) 0.84 (0.78, 0.91)
Age squared 1.04 (1.02, 1.06)
Some college or College graduate 1.28 (1.08, 1.51)
Age x College 1.05 (0.98, 1.12)
Reference Probability (Age 32/Male/No college) 0.94 (0.93, 0.95)
cap3 = paste(
  "**Table 3.** *AIC values for each logistic regression models considered.*",
  "Model 2 has the smallest AIC, but model 1 is likley preferred for",
  "simplicity as the improvement in model fit is small."
)
knitr::kable(aic_tab, caption = cap3) %>%
  kableExtra::kable_styling("striped")
Table 3. AIC values for each logistic regression models considered. Model 2 has the smallest AIC, but model 1 is likley preferred for simplicity as the improvement in model fit is small.
model Stata Command description AIC delta
m1 logistic ohx i.under_20 i.gender c.age_c##c.age_c i.college base model (m1) 12336.85 0.0698444
m2 logistic ohx i.under_20 i.gender c.age_c##c.age_c c.age_c##i.college m1 + age/college interaction 12336.78 0.0000000
m3 logistic ohx i.under_20 i.gender c.age_c##c.age_c c.age_c#i.gender i.college m1 + age/gender interaction 12337.58 0.8009346
m4 logistic ohx i.under_20 i.gender c.age_c##c.age_c i.college##i.gender m1 + gender/college interaction 12337.76 0.9863900
m5 logistic ohx i.under_20 i.gender i.college c.age_c##c.age_c#i.under_20 m1 + 'under 20'/age(both linear and quadratic) interaction 12340.30 3.5279267
m6 logistic ohx i.under_20#i.gender i.college c.age_c##c.age_c c.age_c#c.age_c#c.age_c m1 + cubic age 12337.55 0.7751267

Question 3

In the final question, we create a visualization for a descriptive investigation of how tooth status is associated with age.

The results to be plotted are in ps3_q3_results.csv and are produced by ps3_q3.do. You can view this script or its log below.

ps3_q3.do

writeLines(c('```stata', readLines("./ps3_q3.do"), '```'))
*-----------------------------------------------------------------------------*
* Stats 506, Fall 2020 
* Problem Set 3, Question 3
* 
* This script uses the four-wave NHANES data and the dentition exams to
* descriptively compare how the tooth status changes with age for each
* tooth. Age is discretized into ~6 year windows. 
* 
* Author: James Henderson
* Updated: October 29, 2020
*-----------------------------------------------------------------------------*
// 79: ---------------------------------------------------------------------- *

// set up: ------------------------------------------------------------------ *
*cd ~/path/to/your/files // comment out before submission 
version 16.1
log using ps3_q3.log, text replace

// data prep: --------------------------------------------------------------- *
use ps3_ohxden, clear
gsort id
merge 1:1 id using demo_ps3, keepusing(age)
keep if _merge == 3
drop _merge
keep if ohx_status == 1
drop ohx_status

// create age groups: ------------------------------------------------------- *
generate age_grp = .

foreach upr of numlist 1/14 {
    local start = 6 * (`upr' - 1)
    local stop = 6 * `upr'
    display `start'
    display `stop'
    label define lbl_age_grp `upr' "[`start'-`stop')", add
    replace age_grp = `upr' if age >= `start' & age < `stop'    
}
label values age_grp lbl_age_grp

// count frquencies for each tooth and age group: --------------------------- *
reshape long ohx, i(id age_grp) j(tooth) string
rename ohx ohx_tc
collapse (count) n=id, by(tooth age_grp ohx_tc)

// count totals for each tooth/age group combo: ----------------------------- *
frame copy default total
frame total: collapse (sum) total=n, by(tooth age_grp)

frlink m:1 tooth age_grp, frame(total tooth age_grp) generate(total_link)
frget total, from(total_link)

// "complete" the data with implicit zeros
reshape wide n, i(age_grp tooth) j(ohx_tc)
foreach i of numlist 1/5 {
    replace n`i' = 0 if n`i' == .
}
reshape long n, i(age_grp tooth) j(ohx_tc)

// compute percentages: ----------------------------------------------------- *
generate p = n / total * 100

// truncated confidence intervals
generate stderr = sqrt(p * (100 - p) / total)
generate lwr = p - 1.96 * stderr
generate upr = p + 1.96 * stderr
replace lwr = 0 if lwr < 0
replace upr = 100 if upr > 100

// a little more clean up
drop total_link total

// export results
export delimited using ps3_q3_results.csv

// close log: --------------------------------------------------------------- *
log close

// 79: ---------------------------------------------------------------------- *

ps3_q3.log

writeLines(c('```', readLines("./ps3_q3.log"), '```'))
-------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /Users/jbhender/github/ps506_F20/PS3/ps3_q3.log
  log type:  text
 opened on:  30 Oct 2020, 17:23:47

. 
. // data prep: ---------------------------------------------------------------
>  *
. use ps3_ohxden, clear

. gsort id

. merge 1:1 id using demo_ps3, keepusing(age)

    Result                           # of obs.
    -----------------------------------------
    not matched                         1,490
        from master                         0  (_merge==1)
        from using                      1,490  (_merge==2)

    matched                            35,909  (_merge==3)
    -----------------------------------------

. keep if _merge == 3
(1,490 observations deleted)

. drop _merge

. keep if ohx_status == 1
(1,549 observations deleted)

. drop ohx_status

. 
. // create age groups: -------------------------------------------------------
>  *
. generate age_grp = .
(34,360 missing values generated)

. 
. foreach upr of numlist 1/14 {
  2.         local start = 6 * (`upr' - 1)
  3.         local stop = 6 * `upr'
  4.         display `start'
  5.         display `stop'
  6.         label define lbl_age_grp `upr' "[`start'-`stop')", add
  7.         replace age_grp = `upr' if age >= `start' & age < `stop'        
  8. }
0
6
(4,274 real changes made)
6
12
(4,803 real changes made)
12
18
(3,784 real changes made)
18
24
(2,515 real changes made)
24
30
(2,035 real changes made)
30
36
(2,083 real changes made)
36
42
(2,036 real changes made)
42
48
(1,987 real changes made)
48
54
(1,965 real changes made)
54
60
(1,995 real changes made)
60
66
(2,374 real changes made)
66
72
(1,714 real changes made)
72
78
(1,195 real changes made)
78
84
(1,600 real changes made)

. label values age_grp lbl_age_grp

. 
. // count frquencies for each tooth and age group: ---------------------------
>  *
. reshape long ohx, i(id age_grp) j(tooth) string
(note: j = 01tc 02tc 03tc 04tc 05tc 06tc 07tc 08tc 09tc 10tc 11tc 12tc 13tc 14t
> c 15tc 16tc 17tc 18tc 19tc 20tc 21tc 22tc 23tc 24tc 25tc 26tc 27tc 28tc 29tc 
> 30tc 31tc 32tc)

Data                               wide   ->   long
-----------------------------------------------------------------------------
Number of obs.                    34360   -> 1.1e+06
Number of variables                  35   ->       5
j variable (32 values)                    ->   tooth
xij variables:
            ohx01tc ohx02tc ... ohx32tc   ->   ohx
-----------------------------------------------------------------------------

. rename ohx ohx_tc

. collapse (count) n=id, by(tooth age_grp ohx_tc)

. 
. // count totals for each tooth/age group combo: -----------------------------
>  *
. frame copy default total

. frame total: collapse (sum) total=n, by(tooth age_grp)

. 
. frlink m:1 tooth age_grp, frame(total tooth age_grp) generate(total_link)
  (all observations in frame default matched)

. frget total, from(total_link)
  (1 variable copied from linked frame)

. 
. // "complete" the data with implicit zeros
. reshape wide n, i(age_grp tooth) j(ohx_tc)
(note: j = 1 2 3 4 5)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                     1608   ->     448
Number of variables                   6   ->       9
j variable (5 values)            ohx_tc   ->   (dropped)
xij variables:
                                      n   ->   n1 n2 ... n5
-----------------------------------------------------------------------------

. foreach i of numlist 1/5 {
  2.         replace n`i' = 0 if n`i' == .
  3. }
(319 real changes made)
(21 real changes made)
(214 real changes made)
(0 real changes made)
(78 real changes made)

. reshape long n, i(age_grp tooth) j(ohx_tc)
(note: j = 1 2 3 4 5)

Data                               wide   ->   long
-----------------------------------------------------------------------------
Number of obs.                      448   ->    2240
Number of variables                   9   ->       6
j variable (5 values)                     ->   ohx_tc
xij variables:
                           n1 n2 ... n5   ->   n
-----------------------------------------------------------------------------

. 
. // compute percentages: -----------------------------------------------------
>  *
. generate p = n / total * 100

. 
. // truncated confidence intervals
. generate stderr = sqrt(p * (100 - p) / total)

. generate lwr = p - 1.96 * stderr

. generate upr = p + 1.96 * stderr

. replace lwr = 0 if lwr < 0
(189 real changes made)

. replace upr = 100 if upr > 100
(3 real changes made)

. 
. // a little more clean up
. drop total_link total

. 
. // export results
. export delimited using ps3_q3_results.csv
file ps3_q3_results.csv already exists
r(602);

end of do-file
r(602);

The results are read in using the code below.

q3 = read_delim("ps3_q3_results.csv", delim = ',')

types = c(
  paste(c('3rd', '2nd', '1st'), 'molar'),
  paste(c('2nd', '1st'), 'bicuspid'),
  'cuspid',
  'lateral incisor',
  'central incisor'
)
types_all = c(types, rev(types), types, rev(types))
rl = c(rep('right', 8), rep('left', 16), rep('right', 8))

q3_clean = q3 %>%
  mutate(
    age = str_replace_all(
      str_split(age_grp, '-', 2, simplify = TRUE)[, 2], '\\)', ''),
    age = as.numeric(age),
    tooth_num = as.numeric(str_sub(tooth, 1, 2)),
    type = types_all[tooth_num],
    right_left = rl[tooth_num],
    upr_lwr = ifelse(tooth_num > 16, 'upper', 'lower'),
    tooth_name = paste(upr_lwr, right_left, type),
    Status = ohx_tc
  ) 

Figures 1a-c presents a visualization of this analysis, split up by tooth type.

Figure 1

Molars

cap_template = paste(
  "**Figure %s.** *Tooth status as a function of age for %s.*",
  "Lines represents a linear interpolation of 6-year age buckets with the",
  "shaded areas providing (pointwise) 95%% confidence intervals."
)
cap1a = sprintf(cap_template, "1a", "molars")
q3_clean %>%
  filter(grepl('molar', tooth_name)) %>%
  ggplot(aes(x = age, y = p, group = Status, color = Status) )  +
  geom_ribbon(
    aes(ymin = lwr, ymax = upr, fill = Status, color = NULL),
    alpha = 0.5
  ) + 
  geom_line() +
  facet_wrap(~tooth_name, nrow = 4) +
  theme_bw() +
  ylab('proportion of teeth')
**Figure 1a.** *Tooth status as a function of age for molars.* Lines represents a linear interpolation of 6-year age buckets with the shaded areas providing (pointwise) 95% confidence intervals.

Figure 1a. Tooth status as a function of age for molars. Lines represents a linear interpolation of 6-year age buckets with the shaded areas providing (pointwise) 95% confidence intervals.

Cuspids

cap1b = sprintf(cap_template, "1b", "cuspids and bicuspids")
q3_clean %>%
  filter(grepl('cuspid', tooth_name)) %>%
  ggplot(aes(x = age, y = p, group = Status, color = Status) )  +
  geom_ribbon(
    aes(ymin = lwr, ymax = upr, fill = Status, color = NULL),
    alpha = 0.5
  ) + 
  geom_line() +
  facet_wrap(~tooth_name, nrow = 4) +
  theme_bw() +
  ylab('% of teeth')
**Figure 1b.** *Tooth status as a function of age for cuspids and bicuspids.* Lines represents a linear interpolation of 6-year age buckets with the shaded areas providing (pointwise) 95% confidence intervals.

Figure 1b. Tooth status as a function of age for cuspids and bicuspids. Lines represents a linear interpolation of 6-year age buckets with the shaded areas providing (pointwise) 95% confidence intervals.

Incisors

cap1c = sprintf(cap_template, "1c", "incisors")
q3_clean %>%
  filter(grepl('incisor', tooth_name)) %>%
  ggplot(aes(x = age, y = p, group = Status, color = Status) )  +
  geom_ribbon(
    aes(ymin = lwr, ymax = upr, fill = Status, color = NULL),
    alpha = 0.5
  ) + 
  geom_line() +
  facet_wrap(~tooth_name, nrow = 4) +
  theme_bw() +
  ylab('% of teeth')
**Figure 1c.** *Tooth status as a function of age for incisors.* Lines represents a linear interpolation of 6-year age buckets with the shaded areas providing (pointwise) 95% confidence intervals.

Figure 1c. Tooth status as a function of age for incisors. Lines represents a linear interpolation of 6-year age buckets with the shaded areas providing (pointwise) 95% confidence intervals.