Add exercise and clean up code for example

2025-06-20 13:59:41 +02:00 · 2025-06-20 13:59:41 +02:00 · 40fdb67b67
commit 40fdb67b67
parent 72bd3fcaa7
6 changed files with 186 additions and 40 deletions
--- a/code/hsb.R
+++ b/code/hsb.R
@ -1,33 +1,49 @@
+# hsb.R
+#
+# content: (1) Read and plot data
+#          (2) Fit models with random school effects
+#          (3) Hierarchical modeling
+#
+# input: data/hsbdataset.txt
+# output: --
+#
+# last mod: 2025-06-20, NW
+
 library(lme4)
 library(lattice)

+#----- (1) Read and plot data -------------------------------------------------
 dat <- read.table("data/hsbdataset.txt", header = TRUE)

 dat$gmmath <- mean(dat$mathach)
 dat$meanmath <- with(dat, ave(mathach, school))

-plot(dat$ses - dat$meanses, dat$cses)
-
-
-xyplot(mathach ~ cses, dat)
-xyplot(mathach + meanmath + gmmath ~ cses | factor(school), dat,
-       type = c("p", "r", "r"), distribute.type = TRUE,
+xyplot(mathach + meanmath + gmmath ~ cses | factor(school), data = dat,
+       type = c("p", "r", "r"),
+       distribute.type = TRUE,
       col = c("#91C86E", "#91C86E", "#78004B"))

-xyplot(gmmath + meanmath ~ cses | factor(school), dat, type = "r")
+# Shorter version
+xyplot(gmmath + meanmath ~ cses | factor(school), data = dat, type = "r")

+#----- (2) Fit models with random school effects ------------------------------

+## Null model with school-specific random intercepts
 m1 <- lmer(mathach ~ 1 + (1 | school), dat)

+# Plot predictions
 xyplot(mathach + predict(m1) + predict(m1, re.form = NA) ~ cses | factor(school),
-       dat, type = c("p", "r", "r"), distribute.type = TRUE,
+       data = dat,
+       type = c("p", "r", "r"),
+       distribute.type = TRUE,
       col = c("#91C86E", "#91C86E", "#78004B"))

 # ICC
 VarCorr(m1)[[1]] / (VarCorr(m1)[[1]] + sigma(m1)^2)

-sjPlot::tab_model(m1)
-
+## Model with socioeconomic status and school-specific random intercepts
+xyplot(mathach ~ cses, dat)
+mean(dat$cses)

 m2 <- lmer(mathach ~ cses + (1 | school), dat)

@ -38,36 +54,41 @@ xyplot(mathach + predict(m2) + predict(m2, re.form = NA) ~ cses | factor(school)
 # ICC
 VarCorr(m2)[[1]] / (VarCorr(m2)[[1]] + sigma(m2)^2)

-sjPlot::tab_model(m1, m2)
-
-
+## Model with socioeconomic status and school-specific random slopes
 m3 <- lmer(mathach ~ cses + (cses | school), dat)

 xyplot(mathach + predict(m3) + predict(m3, re.form = NA) ~ cses | factor(school),
       dat, type = c("p", "r", "r"), distribute.type = TRUE,
       col = c("#91C86E", "#91C86E", "#78004B"))

-sjPlot::tab_model(m1, m2, m3)
-
-
+## Model with socioeconomic status, sector, and school-specific random slopes
 m4 <- lmer(mathach ~ cses + sector + (cses | school), data = dat)

-sjPlot::tab_model(m1, m2, m3, m4)
-
+## Model with socioeconomic status, sector, interaction, and school-specific
+## random slopes
 m5 <- lmer(mathach ~ cses * sector + (cses | school), data = dat)

-sjPlot::tab_model(m1, m2, m3, m4, m5)
-
 xyplot(mathach ~ cses, data = dat, groups = sector, type = c("p", "r"))

+#----- (3) Hierarchical modeling ----------------------------------------------
+
+h1 <- lmer(mathach ~ meanses*cses + sector*cses + (1 + cses | school),
+           data = dat, REML = FALSE)
+
+h2 <- lmer(mathach ~ meanses*cses + sector*cses + (1 | school),
+           data = dat, REML = FALSE)
+
+# Likelihood-ratio test
+anova(h2, h1)
+
+pm1 <- profile(h1)
+confint(pm1)
+xyplot(pm1)
+#densityplot(pm1)
+splom(pm1, which = "theta_")


-lmm.1 <- lmer(mathach ~ meanses*cses + sector*cses + (1 | school), data = dat,
-              REML = FALSE)
-
-lmm.2 <- lmer(mathach ~ meanses*cses + sector*cses + (1 + cses | school),
-              data = dat, REML = FALSE)
-
+## Visualization of two way interaction of `cses` and `meanses`
 c <- seq(-2, 2, length = 51)
 m <- seq(-1, 1, length = 26)
 ndat <- expand.grid(c, m)
@ -76,19 +97,16 @@ colnames(ndat) <- c("cses", "meanses")

 ndat$sector <- factor(0, levels = c("0", "1"))

-z <- matrix(predict(lmm.2, newdata=ndat, re.form=NA), 51)
+z <- matrix(predict(lmm.2, newdata = ndat, re.form = NA), 51)

 persp(c, m, z, theta = 40, phi = 20, col = "lightblue", ltheta = 60, shade = .9,
      xlab = "cses", ylab = "meanses", zlab = "mathach", main = "Model 2")

-lmm.3 <- lmer(mathach ~ meanses + sector*cses + (1 + cses | school),
-              data = dat, REML = FALSE)
+h3 <- lmer(mathach ~ meanses + sector*cses + (1 + cses | school),
+           data = dat, REML = FALSE)

 z <- matrix(predict(lmm.3, newdata = ndat, re.form = NA), nrow = 51)

 persp(c, m, z, theta = 40, phi = 20, col = "lightblue", ltheta = 60, shade = .9,
      xlab = "cses", ylab = "meanses", zlab = "mathach", main = "Model 3")

-
-# TODO: Add profiling to show instability of parameter estimation?
-
--- a/exercises/jsp.md
+++ b/exercises/jsp.md
@ -0,0 +1,126 @@
+Exercise: Junior School Project
+================
+Nora Wickelmaier
+2025-06-20
+
+Load the Junior School Project collected from primary (U.S. term is
+elementary) schools in inner London in R. You might need to install the
+faraway package first with `install.packages("faraway")`.
+
+The data frame contains the following variables:
+
+|           |                                                                                                                                                              |
+|-----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `school`  | 50 schools code 1–50                                                                                                                                         |
+| `class`   | a factor with levels `1`, `2`, `3`, and `4`                                                                                                                  |
+| `gender`  | a factor with levels `boy` and `girl`                                                                                                                        |
+| `social`  | class of the father I = 1; II = 2; III nonmanual = 3; III manual = 4; IV = 5; V = 6; Long-term unemployed = 7; Not currently employed = 8; Father absent = 9 |
+| `raven`   | test score                                                                                                                                                   |
+| `id`      | student id coded 1–1402                                                                                                                                      |
+| `english` | score on English                                                                                                                                             |
+| `math`    | score on Maths                                                                                                                                               |
+| `year`    | year of school                                                                                                                                               |
+
+We want to investigate how math achievement is influenced by raven score
+and social class of the father. If you need a refresher on Raven’s
+Progressive Matrices, check here:
+<https://en.wikipedia.org/wiki/Raven%27s_Progressive_Matrices>.
+Basically, it is an intelligent test.
+
+We will take a subset of the data, so that each student provides only
+one data point, for simplicity:
+
+``` r
+data("jsp", package = "faraway")
+dat <- jsp |> subset(year == 0)
+```
+
+<img src="jsp_files/figure-gfm/unnamed-chunk-2-1.png" style="display: block; margin: auto;" />
+
+1.  Create a new variable `craven` where `raven` is centered over all
+    students
+
+2.  Create another variable `gcraven` where `craven` is centered over
+    all schools. Create a variable `mraven` containing the centered
+    average school means first, so that you can calculate
+
+``` r
+dat$gcraven <- dat$craven - dat$mraven
+
+# Check your results
+aggregate(craven ~ school, dat, mean) |> head()
+```
+
+    ##   school     craven
+    ## 1      1 -2.6886533
+    ## 2      2  0.1250722
+    ## 3      3  2.0584055
+    ## 4      4  0.9167389
+    ## 5      5  2.3512627
+    ## 6      6  0.4584055
+
+``` r
+aggregate(gcraven ~ school, dat, mean) |> head()
+```
+
+    ##   school       gcraven
+    ## 1      1  1.253746e-15
+    ## 2      2 -1.184075e-15
+    ## 3      3 -1.421172e-15
+    ## 4      4  1.184283e-15
+    ## 5      5  5.075722e-16
+    ## 6      6  0.000000e+00
+
+3.  Create a plot with `lattice::xyplot()` with `gcraven` on the
+    $x$-axis and `math` on the $y$-axis and one panel for each school.
+    Use `type = c("p",    "g", "r")`. You can also use `ggplot2` if you
+    want to. What would be your conclusion about the need for
+    school-specific slopes based on this plot?
+
+<img src="jsp_files/figure-gfm/unnamed-chunk-7-1.png" style="display: block; margin: auto;" />
+
+4.  We will consider the following levels of the data:
+
+    - Level 1: students
+    - Level 2: schools
+
+    And the variables associated with the levels:
+
+    | Level | Variable  | Description                                  |
+    |-------|-----------|----------------------------------------------|
+    | 2     | `school`  | 50 schools code 1–50                         |
+    | 2     | `mraven`  | mean raven score of school (overall mean 0)  |
+    | 1     | `social`  | class of the father (categorical)            |
+    | 1     | `gcraven` | centered test score (mean for each school 0) |
+    | 1     | `math`    | score on Maths                               |
+
+    Fit the following model containing school-specific intercepts and
+    slopes with `lme4::lmer()`
+
+    $$
+    \begin{align*}
+    \text{(Level 1)} \quad y_{ij} &= b_{0i} + b_{1i}\,gcraven_{ij} + b_{2i}\,social_{ij} + b_{3i}\,(gcraven_{ij}\times social_{ij}) + \varepsilon_{ij}\\
+    \text{(Level 2)} \quad b_{0i} &= \beta_0 + \beta_4\,mraven_i + \upsilon_{0i} \\
+                     \quad b_{1i} &= \beta_1 + \beta_5\,mraven_i + \upsilon_{1i}\\
+                     \quad b_{2i} &= \beta_2\\
+                     \quad b_{3i} &= \beta_3\\
+    \text{(2) in (1)} \quad y_{ij} &= \beta_{0} + \beta_{1}\,gcraven_{ij} + \beta_{2}\,social_{ij} + \beta_{3}(gcraven_{ij}\times social_{ij})\\
+                                &~~~ + \beta_{4}\,mraven_i + \beta_{5}\,(gcraven_{ij} \times mraven_{i})\\
+                                &~~~ + \upsilon_{0i} + \upsilon_{1i}\,gcraven_{ij} + \varepsilon_{ij}
+    \end{align*}
+    $$ with
+    $\boldsymbol\upsilon \sim N(\boldsymbol 0, \boldsymbol{\Sigma}_\upsilon)$
+    i.i.d., $\varepsilon_{ij} \sim N(0, \sigma^2)$ i.i.d.
+
+5.  Interpret the parameters of the model:
+
+    - How much does math score increases if the raven score for a
+      student increases by one point for the reference social class of
+      the father?
+    - How much does math score increases when the raven score per school
+      increases by one point for the reference social class of the
+      father?
+    - What is your conclusion about the interactions in the model. Are
+      they needed?
+    - Does the inclusion of `social` improve the model fit? How can we
+      test this?
--- a/exercises/jsp_files/figure-gfm/unnamed-chunk-2-1.png
+++ b/exercises/jsp_files/figure-gfm/unnamed-chunk-2-1.png
--- a/exercises/jsp_files/figure-gfm/unnamed-chunk-7-1.png
+++ b/exercises/jsp_files/figure-gfm/unnamed-chunk-7-1.png
--- a/slides/lead_lmm.pdf
+++ b/slides/lead_lmm.pdf
--- a/slides/lead_lmm.tex
+++ b/slides/lead_lmm.tex
@ -78,7 +78,9 @@
    \item I will explain the general concepts with the slides
    \item We will switch to R and use the lme4 package to fit the models
    \item You will use R to fit an extension of the model
-    \item We will discuss the results\\~\\
+    \item We will discuss the results
+    \item All the materials are here: \url{https://gitea.iwm-tuebingen.de/nwickelmaier/lead_lmm}
+      \\~\\
    \item[$\to$] Try to go along in R! Ask as many questions as possible, also
      the ones you usually do not dare to ask (because you are supposed to know
      them already or something\dots)
@ -162,7 +164,7 @@
      \begin{column}{.6\textwidth}
        Model equation
    \begin{align*}
-    \text{(Level 1)}  \quad y_{ij} &= b_{0i} + \varepsilon_{ij}\\
+    \text{(Level 1)} ~\quad y_{ij} &= b_{0i} + \varepsilon_{ij}\\
    \text{(Level 2)}  \quad b_{0i} &= \beta_0 + \upsilon_{0i}\\
    \text{(2) in (1)} \quad y_{ij} &= \beta_0 + \upsilon_{0i} + \varepsilon_{ij}
    \end{align*}
@ -236,7 +238,7 @@
 \end{frame}


-\begin{frame}{Regression with random school effects}
+\begin{frame}{Adding socioeconomic status as a predictor}
  \begin{itemize}
    \item How strong is the relationship between students' socioeconomic status
      and their math achievement on average?
@ -267,7 +269,7 @@
      \begin{column}{.6\textwidth}
        Model equation
    \begin{align*}
-    \text{(Level 1)}  \quad y_{ij} &= b_{0i} + b_{1i}\,x_{ij} + \varepsilon_{ij}\\
+    \text{(Level 1)} ~\quad y_{ij} &= b_{0i} + b_{1i}\,x_{ij} + \varepsilon_{ij}\\
    \text{(Level 2)}  \quad b_{0i} &= \beta_0 + \upsilon_{0i}\\
                      \quad b_{1i} &= \beta_1\\
    \text{(2) in (1)} \quad y_{ij} &= \beta_0 + \beta_1\,x_{ij} +
@ -322,7 +324,7 @@
        \end{itemize}\pause
      \item How can we interpret the random slopes for this model?\pause
      \item How do we add random slopes to a random intercept model using
-        \texttt{lme4::lmer()}?
+        \texttt{lme4::lmer()}?\pause
      \item Fit a model with random slopes for socioeconomic status in R
    \end{itemize}
  \end{block}
@ -381,12 +383,12 @@
 \begin{frame}{Hierarchical regression model}
  Model equation
  \begin{align*}
-    \text{(Level 1)}  \quad y_{ij} =&~b_{0i} + b_{1i}\,cses_{ij} + \varepsilon_{ij}\\
+    \text{(Level 1)} ~\quad y_{ij} =&~b_{0i} + b_{1i}\,cses_{ij} + \varepsilon_{ij}\\
    \text{(Level 2)}  \quad b_{0i} =&~\beta_0 + \beta_2 meanses_i + \beta_4 sector_i + \upsilon_{0i}\\
                      \quad b_{1i} =&~\beta_1 + \beta_3 meanses_i + \beta_5 sector_i + \upsilon_{1i}\\
    \text{(2) in (1)} \quad y_{ij} =&~\beta_0 + \beta_1\,cses_{ij} + \beta_2 meanses_i + \beta_4 sector_i\\
                                   & + \beta_3 (cses_{ij} \times meanses_i) + \beta_5 (cses_{ij} \times sector_i) \\
-                                   & + \upsilon_{0i} + cses_{ij}\upsilon_{1i} + \varepsilon_{ij}
+                                   & + \upsilon_{0i} + \upsilon_{1i}cses_{ij} + \varepsilon_{ij}
    \end{align*}
 with
 \begin{align*}
@ -424,7 +426,7 @@ with
      \item Compute the model in R using \texttt{lme4::lmer()}
        {\scriptsize
          \begin{align*}
-    \text{(Level 1)}  \quad y_{ij} =&~b_{0i} + b_{1i}\,cses_{ij} + \varepsilon_{ij}\\
+    \text{(Level 1)} ~\quad y_{ij} =&~b_{0i} + b_{1i}\,cses_{ij} + \varepsilon_{ij}\\
    \text{(Level 2)}  \quad b_{0i} =&~\beta_0 + \beta_2 meanses_i + \beta_4 sector_i + \upsilon_{0i}\\
                      \quad b_{1i} =&~\beta_1 + \beta_3 meanses_i + \beta_5 sector_i + \upsilon_{1i}\\
    \text{(2) in (1)} \quad y_{ij} =&~\beta_0 + \beta_1\,cses_{ij} + \beta_2 meanses_i + \beta_4 sector_i