This example reproduces some of the analyses in the lecture notes for the SEM model of health care utilization.

library(lavaan)      # model fitting
library(semPlot)     # path diagrams
library(dplyr)       # data manipulation

healthdat <- read.table("http://euclid.psych.yorku.ca/datavis/courses/factor/data/healthutil.txt")
names(healthdat)
## [1] "age"      "stress"   "esteem"   "marriage" "control"  "physical"
## [7] "mental"   "druguse"  "drvisits"

## Specify the model

=~ specifies measurement equations for the latent variables; ~ for regressions; ~~ for correlations or covariances

health.mod1 <- '
# latent variables
Self =~ esteem + marriage + control
Ill  =~ physical + mental
Util =~ druguse + drvisits
# regressions
Ill  ~ age + stress + Self
Util ~ age + stress + Ill
# correlations (phi)
Self ~~ age + stress
age  ~~ stress
'

## fit this model using lavaan::sem

health.sem1 <- lavaan::sem(health.mod1,
data=healthdat,
estimator="ML",
fixed.x=FALSE)

Quick look at fit measures. This model doesnâ€™t fit well by any criteria

fitMeasures(health.sem1, c("chisq", "df", "pvalue", "cfi", "rmsea"))
##   chisq      df  pvalue     cfi   rmsea
## 111.521  20.000   0.000   0.878   0.101

For more detailed output, use summary()

summary(health.sem1, standardized=TRUE, fit.measures=TRUE)
## lavaan 0.6-3 ended normally after 104 iterations
##
##   Optimization method                           NLMINB
##   Number of free parameters                         25
##
##   Number of observations                           445
##
##   Estimator                                         ML
##   Model Fit Test Statistic                     111.521
##   Degrees of freedom                                20
##   P-value (Chi-square)                           0.000
##
## Model test baseline model:
##
##   Minimum Function Test Statistic              784.615
##   Degrees of freedom                                36
##   P-value                                        0.000
##
## User model versus baseline model:
##
##   Comparative Fit Index (CFI)                    0.878
##   Tucker-Lewis Index (TLI)                       0.780
##
## Loglikelihood and Information Criteria:
##
##   Loglikelihood user model (H0)              -9120.139
##   Loglikelihood unrestricted model (H1)      -9064.378
##
##   Number of free parameters                         25
##   Akaike (AIC)                               18290.278
##   Bayesian (BIC)                             18392.730
##   Sample-size adjusted Bayesian (BIC)        18313.391
##
## Root Mean Square Error of Approximation:
##
##   RMSEA                                          0.101
##   90 Percent Confidence Interval          0.084  0.120
##   P-value RMSEA <= 0.05                          0.000
##
## Standardized Root Mean Square Residual:
##
##   SRMR                                           0.065
##
## Parameter Estimates:
##
##   Information                                 Expected
##   Information saturated (h1) model          Structured
##   Standard Errors                             Standard
##
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Self =~
##     esteem            1.000                               2.552    0.644
##     marriage          1.510    0.294    5.132    0.000    3.853    0.432
##     control           0.266    0.051    5.260    0.000    0.678    0.535
##   Ill =~
##     physical          1.000                               1.873    0.798
##     mental            1.450    0.128   11.326    0.000    2.717    0.646
##   Util =~
##     druguse           1.000                               5.198    0.569
##     drvisits          0.055    0.006    9.833    0.000    0.288    0.698
##
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Ill ~
##     age               0.119    0.046    2.582    0.010    0.063    0.141
##     stress            0.692    0.079    8.717    0.000    0.370    0.482
##     Self              0.201    0.055    3.691    0.000    0.274    0.274
##   Util ~
##     age               0.037    0.125    0.296    0.767    0.007    0.016
##     stress            0.482    0.264    1.823    0.068    0.093    0.121
##     Ill               2.451    0.302    8.108    0.000    0.883    0.883
##
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Self ~~
##     age              -0.578    0.359   -1.607    0.108   -0.226   -0.102
##     stress           -0.010    0.209   -0.048    0.961   -0.004   -0.003
##   age ~~
##     stress           -0.842    0.143   -5.886    0.000   -0.842   -0.291
##
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .esteem            9.201    1.351    6.808    0.000    9.201    0.586
##    .marriage         64.572    5.257   12.283    0.000   64.572    0.813
##    .control           1.145    0.116    9.879    0.000    1.145    0.713
##    .physical          2.000    0.282    7.093    0.000    2.000    0.363
##    .mental           10.290    0.871   11.808    0.000   10.290    0.582
##    .druguse          56.380    4.505   12.516    0.000   56.380    0.676
##    .drvisits          0.087    0.010    9.160    0.000    0.087    0.513
##     age               4.943    0.331   14.916    0.000    4.943    1.000
##     stress            1.701    0.114   14.916    0.000    1.701    1.000
##     Self              6.511    1.475    4.415    0.000    1.000    1.000
##    .Ill               2.528    0.350    7.233    0.000    0.721    0.721
##    .Util              3.044    2.633    1.156    0.248    0.113    0.113

### Examine residual correlations.

Look for the largest ones in absolute value

resid(health.sem1, type='cor')\$cov
##          esteem marrig contrl physcl mental drugus drvsts age    stress
## esteem    0.000
## marriage  0.020  0.000
## control   0.000 -0.029  0.000
## physical -0.054 -0.010  0.004  0.000
## mental    0.115  0.168  0.196 -0.017  0.000
## druguse  -0.133  0.028  0.001 -0.011  0.023  0.000
## drvisits -0.118 -0.032 -0.041  0.054 -0.070  0.000  0.000
## age       0.082 -0.054 -0.082  0.037 -0.073 -0.010  0.006  0.000
## stress   -0.100  0.112  0.066 -0.051  0.101  0.019 -0.012  0.000  0.000

### check modification indices

Just look for the largest ones and only for residual correlations

modindices(health.sem1, minimum.value=20, op="~~")
##         lhs op      rhs     mi    epc sepc.lv sepc.all sepc.nox
## 66 physical ~~ drvisits 35.327  0.278   0.278    0.666    0.666
## 70   mental ~~ drvisits 21.326 -0.341  -0.341   -0.360   -0.360

## Draw path diagrams

semPlot::semPaths draws reasonable path diagrams, with many, many options. I always have to play around with these to get something relatively pleasing.

semPaths(health.sem1,
what="std",
style="lisrel",
color=list(lat="pink", man="lightblue"),
nCharNodes=8,
sizeMan=8, sizeLat=10)

semPaths(health.sem1,
what="std",
color=list(lat="pink", man="lightblue"),
nCharNodes=0,
sizeMan=10, sizeLat=8,
edge.label.cex=1,
rotation=2,
residuals=FALSE,
layout="tree2", label.norm="OOOOO")

## New model: Add covariance between physical and drvisits

health.mod2 <- '
# latent variables
Self =~ esteem + marriage + control
Ill  =~ physical + mental
Util =~ druguse + drvisits
# regressions
Ill  ~ age + stress + Self
Util ~ age + stress + Ill
Self ~~ age + stress
# correlations (phi)
age ~~ stress
physical ~~ drvisits
'

# or, more simply: just add this covariance to Model 1
health.mod2 <- paste(health.mod1,
'physical ~~ drvisits
')
cat(health.mod2)
##
## # latent variables
##  Self =~ esteem + marriage + control
##  Ill  =~ physical + mental
##  Util =~ druguse + drvisits
## # regressions
##  Ill  ~ age + stress + Self
##  Util ~ age + stress + Ill
## # correlations (phi)
##  Self ~~ age + stress
##  age  ~~ stress
##  physical ~~ drvisits

## Fit model 2

health.sem2 <- lavaan::sem(health.mod2,
data=healthdat,
estimator="ML",
fixed.x=FALSE)

With one more parameter, this model fits better, even if there is still a lack of fit measured by the $$\chi^2$$ test

fitMeasures(health.sem2, c("chisq", "df", "pvalue", "cfi", "rmsea"))
##  chisq     df pvalue    cfi  rmsea
## 68.568 19.000  0.000  0.934  0.077
summary(health.sem2, standardized=TRUE, fit.measures=FALSE)
## lavaan 0.6-3 ended normally after 113 iterations
##
##   Optimization method                           NLMINB
##   Number of free parameters                         26
##
##   Number of observations                           445
##
##   Estimator                                         ML
##   Model Fit Test Statistic                      68.568
##   Degrees of freedom                                19
##   P-value (Chi-square)                           0.000
##
## Parameter Estimates:
##
##   Information                                 Expected
##   Information saturated (h1) model          Structured
##   Standard Errors                             Standard
##
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Self =~
##     esteem            1.000                               2.426    0.612
##     marriage          1.607    0.282    5.705    0.000    3.900    0.438
##     control           0.291    0.047    6.168    0.000    0.707    0.558
##   Ill =~
##     physical          1.000                               1.456    0.619
##     mental            2.402    0.251    9.555    0.000    3.496    0.832
##   Util =~
##     druguse           1.000                               5.394    0.591
##     drvisits          0.044    0.006    7.884    0.000    0.237    0.580
##
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Ill ~
##     age               0.077    0.035    2.230    0.026    0.053    0.118
##     stress            0.552    0.073    7.543    0.000    0.379    0.494
##     Self              0.279    0.054    5.164    0.000    0.466    0.466
##   Util ~
##     age               0.124    0.136    0.917    0.359    0.023    0.051
##     stress            1.102    0.289    3.811    0.000    0.204    0.266
##     Ill               2.413    0.343    7.026    0.000    0.651    0.651
##
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   Self ~~
##     age              -0.618    0.345   -1.789    0.074   -0.255   -0.115
##     stress            0.032    0.200    0.161    0.872    0.013    0.010
##   age ~~
##     stress           -0.842    0.143   -5.886    0.000   -0.842   -0.291
##  .physical ~~
##    .drvisits          0.274    0.040    6.855    0.000    0.274    0.447
##
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .esteem            9.824    1.127    8.719    0.000    9.824    0.625
##    .marriage         64.213    5.124   12.531    0.000   64.213    0.809
##    .control           1.105    0.109   10.154    0.000    1.105    0.688
##    .physical          3.413    0.298   11.458    0.000    3.413    0.617
##    .mental            5.444    1.126    4.836    0.000    5.444    0.308
##    .druguse          54.309    5.094   10.661    0.000   54.309    0.651
##    .drvisits          0.110    0.010   10.965    0.000    0.110    0.663
##     age               4.943    0.331   14.916    0.000    4.943    1.000
##     stress            1.701    0.114   14.916    0.000    1.701    1.000
##     Self              5.887    1.229    4.788    0.000    1.000    1.000
##    .Ill               1.201    0.218    5.518    0.000    0.567    0.567
##    .Util             10.299    3.346    3.078    0.002    0.354    0.354

Print the standardized solution, but just for the latent variables and regressions

standardizedSolution(health.sem2) %>%
filter(op != "~~") %>%
print(digits=3)
##     lhs op      rhs est.std     se      z   pvalue ci.lower ci.upper
## 1  Self =~   esteem  0.6121 0.0560 10.927 0.00e+00   0.5023    0.722
## 2  Self =~ marriage  0.4376 0.0554  7.899 2.89e-15   0.3290    0.546
## 3  Self =~  control  0.5582 0.0552 10.116 0.00e+00   0.4500    0.666
## 4   Ill =~ physical  0.6189 0.0403 15.345 0.00e+00   0.5399    0.698
## 5   Ill =~   mental  0.8318 0.0391 21.292 0.00e+00   0.7552    0.908
## 6  Util =~  druguse  0.5906 0.0475 12.422 0.00e+00   0.4974    0.684
## 7  Util =~ drvisits  0.5801 0.0470 12.347 0.00e+00   0.4881    0.672
## 8   Ill  ~      age  0.1182 0.0521  2.270 2.32e-02   0.0161    0.220
## 9   Ill  ~   stress  0.4943 0.0496  9.964 0.00e+00   0.3971    0.592
## 10  Ill  ~     Self  0.4657 0.0610  7.629 2.38e-14   0.3460    0.585
## 11 Util  ~      age  0.0513 0.0558  0.918 3.59e-01  -0.0582    0.161
## 12 Util  ~   stress  0.2665 0.0680  3.916 8.99e-05   0.1331    0.400
## 13 Util  ~      Ill  0.6513 0.0685  9.512 0.00e+00   0.5171    0.786

## Compare models

These models are nested, so the $$\chi^2$$ test for their difference indicates whether model 2 is a significant improvement.

anova(health.sem1, health.sem2)
## Chi Square Difference Test
##
##             Df   AIC   BIC   Chisq Chisq diff Df diff Pr(>Chisq)
## health.sem2 19 18249 18356  68.568
## health.sem1 20 18290 18393 111.521     42.953       1  5.607e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1`