#load needed packages. make sure they are installed.
library(here) #for data loading/saving
library(dplyr)
library(skimr)
library(ggplot2)
library(tidyverse)
library(tidymodels)
library(performance)
Model Fitting
In this section, we will explore the following tasks: 1. Loads cleaned data. 2. Fits a linear model to the continuous outcome (Body temperature) using only the main predictor of interest. 3. Fits another linear model to the continuous outcome using all (important) predictors of interest. 4. Compares the model results for the model with just the main predictor and all predictors. 5. Fits a logistic model to the categorical outcome (Nausea) using only the main predictor of interest. 6. Fits another logistic model to the categorical outcome using all (important) predictors of interest. 7. Compares the model results for the categorical model with just the main predictor and all predictors.
Setup
Loading data
#Path to data.
<- here::here("fluanalysis", "data", "cleaned_data.rds")
data_location #load data
<- readRDS(data_location) mydata
This sets up the linear model
<- linear_reg() %>% set_engine("lm") lm_mod
Fitting a linear model to the continuous outcome (Body temp) using only the main predictor of interest (RunnyNose)
<- lm_mod %>% fit(BodyTemp ~ RunnyNose, data = mydata)
lm_fit1 tidy(lm_fit1)
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 99.1 0.0819 1210. 0
2 RunnyNoseYes -0.293 0.0971 -3.01 0.00268
Fitting a linear model to the continuous outcome (Body temp) using all other predictors of interest
<- lm_mod %>% fit(BodyTemp ~ ., data = mydata)
lm_fit2 tidy(lm_fit2)
# A tibble: 38 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 97.9 0.304 322. 0
2 SwollenLymphNodesYes -0.165 0.0920 -1.80 0.0727
3 ChestCongestionYes 0.0873 0.0975 0.895 0.371
4 ChillsSweatsYes 0.201 0.127 1.58 0.114
5 NasalCongestionYes -0.216 0.114 -1.90 0.0584
6 CoughYNYes 0.314 0.241 1.30 0.193
7 SneezeYes -0.362 0.0983 -3.68 0.000249
8 FatigueYes 0.265 0.161 1.65 0.0996
9 SubjectiveFeverYes 0.437 0.103 4.22 0.0000271
10 HeadacheYes 0.0115 0.125 0.0913 0.927
# … with 28 more rows
Comparing the above linear models: lm_fit1 which is a reduced model and lm_fit2 which is a full model
anova(lm_fit1$fit, lm_fit2$fit)
Analysis of Variance Table
Model 1: BodyTemp ~ RunnyNose
Model 2: BodyTemp ~ SwollenLymphNodes + ChestCongestion + ChillsSweats +
NasalCongestion + CoughYN + Sneeze + Fatigue + SubjectiveFever +
Headache + Weakness + WeaknessYN + CoughIntensity + CoughYN2 +
Myalgia + MyalgiaYN + RunnyNose + AbPain + ChestPain + Diarrhea +
EyePn + Insomnia + ItchyEye + Nausea + EarPn + Hearing +
Pharyngitis + Breathless + ToothPn + Vision + Vomit + Wheeze
Res.Df RSS Df Sum of Sq F Pr(>F)
1 728 1030.53
2 695 909.12 33 121.41 2.8126 4.811e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glance(lm_fit1) %>%
::select(adj.r.squared, sigma, AIC, BIC, p.value) dplyr
# A tibble: 1 × 5
adj.r.squared sigma AIC BIC p.value
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.0110 1.19 2329. 2343. 0.00268
glance(lm_fit2) %>%
::select(adj.r.squared, sigma, AIC, BIC, p.value) dplyr
# A tibble: 1 × 5
adj.r.squared sigma AIC BIC p.value
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.0860 1.14 2304. 2469. 0.0000000420
# Another way to compare these models
compare_performance(lm_fit1, lm_fit2)
# Comparison of Model Performance Indices
Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------------------------------------------
lm_fit1 | _lm | 2329.3 (<.001) | 2329.4 (<.001) | 2343.1 (>.999) | 0.012 | 0.011 | 1.188 | 1.190
lm_fit2 | _lm | 2303.8 (>.999) | 2307.7 (>.999) | 2469.2 (<.001) | 0.129 | 0.086 | 1.116 | 1.144
This sets up logistics regression
<- logistic_reg() %>%
log_mod set_engine("glm")
Fitting logistic model to the categorical outcome (Nausea) using only the main predictor of interest (RunnyNose)
<- log_mod %>% fit(Nausea ~ RunnyNose, data = mydata)
log_fit1 tidy(log_fit1)
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.658 0.145 -4.53 0.00000589
2 RunnyNoseYes 0.0502 0.172 0.292 0.770
Fitting logistic model to the categorical outcome (Nausea) using all other predictors of interest
<- log_mod %>% fit(Nausea ~ ., data = mydata)
log_fit2 tidy(log_fit2)
# A tibble: 38 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.223 7.83 0.0285 0.977
2 SwollenLymphNodesYes -0.251 0.196 -1.28 0.200
3 ChestCongestionYes 0.276 0.213 1.30 0.195
4 ChillsSweatsYes 0.274 0.288 0.952 0.341
5 NasalCongestionYes 0.426 0.255 1.67 0.0944
6 CoughYNYes -0.140 0.519 -0.271 0.787
7 SneezeYes 0.177 0.210 0.840 0.401
8 FatigueYes 0.229 0.372 0.616 0.538
9 SubjectiveFeverYes 0.278 0.225 1.23 0.218
10 HeadacheYes 0.331 0.285 1.16 0.245
# … with 28 more rows
Comparing the above logistic models: log_fit1 which is a reduced model and log_fit2 which is a full model
anova(log_fit1$fit, log_fit2$fit)
Analysis of Deviance Table
Model 1: Nausea ~ RunnyNose
Model 2: Nausea ~ SwollenLymphNodes + ChestCongestion + ChillsSweats +
NasalCongestion + CoughYN + Sneeze + Fatigue + SubjectiveFever +
Headache + Weakness + WeaknessYN + CoughIntensity + CoughYN2 +
Myalgia + MyalgiaYN + RunnyNose + AbPain + ChestPain + Diarrhea +
EyePn + Insomnia + ItchyEye + EarPn + Hearing + Pharyngitis +
Breathless + ToothPn + Vision + Vomit + Wheeze + BodyTemp
Resid. Df Resid. Dev Df Deviance
1 728 944.57
2 695 751.47 33 193.1
compare_performance(log_fit1, log_fit2)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
# Comparison of Model Performance Indices
Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | Score_spherical | PCP
----------------------------------------------------------------------------------------------------------------------------------------------
log_fit1 | _glm | 948.6 (<.001) | 948.6 (<.001) | 957.8 (>.999) | 1.169e-04 | 0.477 | 1.139 | 0.647 | -107.871 | 0.012 | 0.545
log_fit2 | _glm | 821.5 (>.999) | 825.1 (>.999) | 982.2 (<.001) | 0.247 | 0.414 | 1.040 | 0.515 | -Inf | 0.002 | 0.658