FAQ and Gallery showing various tables possible with the {gtsummary} package.
library(gtsummary); library(gt); library(survival)
library(dplyr); library(stringr); library(purrr); library(forcats); library(tidyr)
Headers, Labels and Formatting
Adding and Modifying Statistics
How do I include a column for missing values of a grouping variable?
How do I summarize a continuous variable by one, two (or more) categorical variables?
How do I stratify a summary table by more than one variable?
How do I add a p-value for each group compared to a single reference group?
Statistical Tests
Headers, Labels and Formatting
Creating and Combining Tables
Adding and Modifying Statistics
Add a spanning header over the group columns for increased clarity,
and modify column headers. Using `bold_labels()` formats the labels as
bold, but labels can also be italicized using `italicize_labels()`, or
the two can be combined to format labels in both bold and italics.
# Summarize age and grade by treatment arm, reporting the median (IQR) for
# continuous variables and omitting the row of missing-value counts.
trial %>%
  select(trt, age, grade) %>%
  tbl_summary(
    by = trt,
    missing = "no",
    statistic = all_continuous() ~ "{median} ({p25}, {p75})"
  ) %>%
  # put the group level on the first header line and its N (%) on the second
  modify_header(
    all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p)}%)"
  ) %>%
  add_n() %>%
  bold_labels() %>%
  # a single spanning header sits above every statistic column
  modify_spanning_header(all_stat_cols() ~ "**Chemotherapy Treatment**")
Characteristic | N | Chemotherapy Treatment | |
---|---|---|---|
Drug A N = 98 (49%)1 |
Drug B N = 102 (51%)1 |
||
Age | 189 | 46 (37, 59) | 48 (39, 56) |
Grade | 200 | ||
I | 35 (36%) | 33 (32%) | |
II | 32 (33%) | 36 (35%) | |
III | 31 (32%) | 33 (32%) | |
1 Median (IQR); n (%) |
Show continuous summary statistics on multiple lines. Levels are
italicized here using the `italicize_levels()` function, but the
`bold_levels()` function can be used instead to create bold text, or
both functions can be used together to get text that is both bold and
in italics.
# Summarize age and marker by treatment arm with type "continuous2", so that
# each statistic string below is printed on its own row.
trial %>%
  select(trt, age, marker) %>%
  tbl_summary(
    by = trt,
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c(
      "{N_nonmiss}",
      "{mean} ({sd})",
      "{median} ({p25}, {p75})",
      "{min}, {max}"
    ),
    missing = "no"
  ) %>%
  italicize_levels()
Characteristic | Drug A, N = 98 | Drug B, N = 102 |
---|---|---|
Age | ||
N | 91 | 98 |
Mean (SD) | 47 (15) | 47 (14) |
Median (IQR) | 46 (37, 59) | 48 (39, 56) |
Range | 6, 78 | 9, 83 |
Marker Level (ng/mL) | ||
N | 92 | 98 |
Mean (SD) | 1.02 (0.89) | 0.82 (0.83) |
Median (IQR) | 0.84 (0.24, 1.57) | 0.52 (0.19, 1.20) |
Range | 0.00, 3.87 | 0.01, 3.64 |
Modify the function that formats the p-values, change the variable labels, update the tumor response header, and add a correction for multiple testing.
# Compare age and grade between tumor-response groups, with relabelled
# variables, p-values formatted to two digits, and an added q-value column.
trial %>%
  select(response, age, grade) %>%
  # the factor labels become the column headers of the `by` groups
  mutate(
    response = factor(
      response,
      labels = c("No Tumor Response", "Tumor Responded")
    )
  ) %>%
  tbl_summary(
    by = response,
    missing = "no",
    label = list(age ~ "Patient Age", grade ~ "Tumor Grade")
  ) %>%
  add_p(pvalue_fun = ~ style_pvalue(.x, digits = 2)) %>%
  add_q()
Characteristic | No Tumor Response, N = 1321 | Tumor Responded, N = 611 | p-value2 | q-value3 |
---|---|---|---|---|
Patient Age | 46 (36, 55) | 49 (43, 59) | 0.091 | 0.18 |
Tumor Grade | 0.93 | 0.93 | ||
I | 46 (35%) | 21 (34%) | ||
II | 44 (33%) | 19 (31%) | ||
III | 42 (32%) | 21 (34%) | ||
1 Median (IQR); n (%) | ||||
2 Wilcoxon rank sum test; Pearson's Chi-squared test | ||||
3 False discovery rate correction for multiple testing |
Include missing tumor response as a column using `fct_explicit_na()`.
# Turn missing tumor response into an explicit factor level so that it is
# shown as its own column of the summary table.
# NOTE(review): fct_explicit_na() is deprecated as of forcats 1.0.0 in favour
# of fct_na_value_to_level(); kept here to match the accompanying text.
trial %>%
  select(response, age, grade) %>%
  mutate(
    response = factor(
      response,
      labels = c("No Tumor Response", "Tumor Responded")
    ) %>%
      fct_explicit_na(na_level = "Missing Response Status")
  ) %>%
  tbl_summary(
    by = response,
    label = list(age ~ "Patient Age", grade ~ "Tumor Grade")
  )
Characteristic | No Tumor Response, N = 1321 | Tumor Responded, N = 611 | Missing Response Status, N = 71 |
---|---|---|---|
Patient Age | 46 (36, 55) | 49 (43, 59) | 52 (44, 57) |
Unknown | 7 | 3 | 1 |
Tumor Grade | |||
I | 46 (35%) | 21 (34%) | 1 (14%) |
II | 44 (33%) | 19 (31%) | 5 (71%) |
III | 42 (32%) | 21 (34%) | 1 (14%) |
1 Median (IQR); n (%) |
Report treatment differences between two groups. This is often needed in randomized trials. In this example, we report the difference in tumor response and marker level between two chemotherapy treatments.
# Report the difference in tumor response and marker level between the two
# treatment arms.
trial %>%
  select(response, marker, trt) %>%
  tbl_summary(
    by = trt,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{p}%"
    ),
    missing = "no"
  ) %>%
  add_difference() %>%
  add_n() %>%
  modify_header(all_stat_cols() ~ "**{level}**") %>%
  # drop the footnote on the statistic columns
  modify_footnote(all_stat_cols() ~ NA)
Characteristic | N | Drug A | Drug B | Difference1 | 95% CI1,2 | p-value1 |
---|---|---|---|---|---|---|
Tumor Response | 193 | 29% | 34% | -4.2% | -18%, 9.9% | 0.6 |
Marker Level (ng/mL) | 190 | 1.02 (0.89) | 0.82 (0.83) | 0.20 | -0.05, 0.44 | 0.12 |
1 Two sample test for equality of proportions; Welch Two Sample t-test | ||||||
2 CI = Confidence Interval |
Paired t-test and McNemar’s test. The data is expected in a long format with 2 rows per participant.
# imagine that each patient received Drug A and Drug B (adding ID showing their paired measurements)
trial_paired <-
  trial %>%
  select(trt, marker, response) %>%
  # number the rows within each arm so that row i of Drug A pairs with
  # row i of Drug B
  group_by(trt) %>%
  mutate(id = row_number()) %>%
  ungroup()

# you must first delete incomplete pairs from the data, then you can build the table
trial_paired %>%
  # delete missing values
  filter(complete.cases(.)) %>%
  # keep IDs with both measurements
  group_by(id) %>%
  filter(n() == 2) %>%
  ungroup() %>%
  # summarize data
  tbl_summary(by = trt, include = -id) %>%
  add_p(
    test = list(
      marker ~ "paired.t.test",
      response ~ "mcnemar.test"
    ),
    group = id
  )
Characteristic | Drug A, N = 831 | Drug B, N = 831 | p-value2 |
---|---|---|---|
Marker Level (ng/mL) | 0.82 (0.22, 1.63) | 0.53 (0.18, 1.26) | 0.2 |
Tumor Response | 21 (25%) | 28 (34%) | 0.3 |
1 Median (IQR); n (%) | |||
2 Paired t-test; McNemar's Chi-squared test with continuity correction |
Include p-values comparing all groups to a single reference group.
# table summarizing data with no p-values
small_trial <- trial %>% select(grade, age, response)
t0 <- small_trial %>%
  tbl_summary(by = grade, missing = "no") %>%
  modify_header(all_stat_cols() ~ "**{level}**")

# table comparing grade I and II
t1 <- small_trial %>%
  filter(grade %in% c("I", "II")) %>%
  tbl_summary(by = grade, missing = "no") %>%
  add_p() %>%
  modify_header(p.value ~ md("**I vs. II**")) %>%
  # hide summary stat columns
  modify_column_hide(all_stat_cols())

# table comparing grade I and III
t2 <- small_trial %>%
  filter(grade %in% c("I", "III")) %>%
  tbl_summary(by = grade, missing = "no") %>%
  add_p() %>%
  modify_header(p.value ~ md("**I vs. III**")) %>%
  # hide summary stat columns
  modify_column_hide(all_stat_cols())

# merging the 3 tables together, and adding additional gt formatting
tbl_merge(list(t0, t1, t2)) %>%
  modify_spanning_header(
    list(
      all_stat_cols() ~ "**Tumor Grade**",
      starts_with("p.value") ~ "**p-values**"
    )
  )
Characteristic | Tumor Grade | p-values | |||
---|---|---|---|---|---|
I1 | II1 | III1 | I vs. II2 | I vs. III2 | |
Age | 47 (37, 56) | 48 (37, 57) | 47 (38, 58) | 0.7 | 0.5 |
Tumor Response | 21 (31%) | 19 (30%) | 21 (33%) | >0.9 | 0.9 |
1 Median (IQR); n (%) | |||||
2 Wilcoxon rank sum test; Fisher's exact test |
Add 95% confidence interval around the mean as an additional column
# Show the mean (SD) of age and marker, then append a confidence-interval
# column with add_ci().
trial %>%
  select(age, marker) %>%
  tbl_summary(
    statistic = all_continuous() ~ "{mean} ({sd})",
    missing = "no"
  ) %>%
  modify_header(stat_0 ~ "**Mean (SD)**") %>%
  add_ci()
Characteristic | Mean (SD)1 | 95% CI2 |
---|---|---|
Age | 47 (14) | 45, 49 |
Marker Level (ng/mL) | 0.92 (0.86) | 0.79, 1.0 |
1 Mean (SD) | ||
2 CI = Confidence Interval |
It’s often necessary to summarize a continuous variable by one, two, or
more categorical variables. The example below shows a table summarizing
a continuous variable by two categorical variables. To summarize by more
than two categorical variables, use `tbl_continuous()` in conjunction
with `tbl_strata()` (see the `tbl_strata()` example in the next section).
# Summarize the continuous marker within each grade level, split into
# columns by treatment arm.
trial %>%
  select(trt, grade, marker) %>%
  tbl_continuous(variable = marker, by = trt) %>%
  modify_spanning_header(all_stat_cols() ~ "**Treatment Assignment**")
Characteristic | Treatment Assignment | |
---|---|---|
Drug A, N = 981 | Drug B, N = 1021 | |
Grade | ||
I | 0.96 (0.24, 1.70) | 1.05 (0.29, 1.49) |
II | 0.66 (0.31, 1.23) | 0.21 (0.10, 0.94) |
III | 0.84 (0.17, 1.91) | 0.58 (0.35, 1.36) |
1 Marker Level (ng/mL): Median (IQR) |
Build a summary table stratified by more than one variable.
# Build one treatment-by-variable summary per tumor grade and place the
# per-grade tables side by side.
trial %>%
  select(trt, grade, age, stage) %>%
  # prefix the level so that the strata read "Grade I", "Grade II", ...
  mutate(grade = paste("Grade", grade)) %>%
  tbl_strata(
    strata = grade,
    # the formula is applied to the data of each stratum (.x)
    ~ .x %>%
      tbl_summary(by = trt, missing = "no") %>%
      modify_header(all_stat_cols() ~ "**{level}**")
  )
Characteristic | Grade I | Grade II | Grade III | |||
---|---|---|---|---|---|---|
Drug A1 | Drug B1 | Drug A1 | Drug B1 | Drug A1 | Drug B1 | |
Age | 46 (36, 60) | 48 (42, 55) | 44 (31, 54) | 50 (43, 57) | 52 (42, 60) | 45 (36, 52) |
T Stage | ||||||
T1 | 8 (23%) | 9 (27%) | 14 (44%) | 9 (25%) | 6 (19%) | 7 (21%) |
T2 | 8 (23%) | 10 (30%) | 8 (25%) | 9 (25%) | 9 (29%) | 10 (30%) |
T3 | 11 (31%) | 7 (21%) | 5 (16%) | 6 (17%) | 6 (19%) | 8 (24%) |
T4 | 8 (23%) | 7 (21%) | 5 (16%) | 12 (33%) | 10 (32%) | 8 (24%) |
1 Median (IQR); n (%) |
Include number of observations and the number of events in a univariate regression table.
# Univariate logistic regressions of tumor response on each remaining
# column, reported as odds ratios with an added event-count column.
trial %>%
  select(response, age, grade) %>%
  tbl_uvregression(
    method = glm,
    y = response,
    method.args = list(family = binomial),
    exponentiate = TRUE
  ) %>%
  add_nevent()
Characteristic | N | Event N | OR1 | 95% CI1 | p-value |
---|---|---|---|---|---|
Age | 183 | 58 | 1.02 | 1.00, 1.04 | 0.10 |
Grade | 193 | 61 | |||
I | — | — | |||
II | 0.95 | 0.45, 2.00 | 0.9 | ||
III | 1.10 | 0.52, 2.29 | 0.8 | ||
1 OR = Odds Ratio, CI = Confidence Interval |
Include two related models side-by-side with descriptive statistics. We also use the compact table theme that reduces cell padding and font size.
# logistic regression for tumor response, reported as odds ratios
gt_r1 <- glm(response ~ trt + grade, trial, family = binomial) %>%
  tbl_regression(exponentiate = TRUE)

# Cox regression for time to death, reported as hazard ratios
gt_r2 <- coxph(Surv(ttdeath, death) ~ trt + grade, trial) %>%
  tbl_regression(exponentiate = TRUE)

# descriptive statistics for the two model covariates
gt_t1 <- trial[c("trt", "grade")] %>%
  tbl_summary(missing = "no") %>%
  add_n() %>%
  modify_header(stat_0 ~ "**n (%)**") %>%
  modify_footnote(stat_0 ~ NA_character_)

# switch to the compact theme before building the merged table
theme_gtsummary_compact()
#> Setting theme `Compact`

# descriptives first (no spanner), then one spanner per model
tbl_merge(
  list(gt_t1, gt_r1, gt_r2),
  tab_spanner = c(NA_character_, "**Tumor Response**", "**Time to Death**")
)
Characteristic | N | n (%) | Tumor Response | Time to Death | ||||
---|---|---|---|---|---|---|---|---|
OR1 | 95% CI1 | p-value | HR1 | 95% CI1 | p-value | |||
Chemotherapy Treatment | 200 | |||||||
Drug A | 98 (49%) | — | — | — | — | |||
Drug B | 102 (51%) | 1.21 | 0.66, 2.24 | 0.5 | 1.25 | 0.86, 1.81 | 0.2 | |
Grade | 200 | |||||||
I | 68 (34%) | — | — | — | — | |||
II | 68 (34%) | 0.94 | 0.44, 1.98 | 0.9 | 1.28 | 0.80, 2.06 | 0.3 | |
III | 64 (32%) | 1.09 | 0.52, 2.27 | 0.8 | 1.69 | 1.07, 2.66 | 0.024 | |
1 OR = Odds Ratio, CI = Confidence Interval, HR = Hazard Ratio |
Include the number of events at each level of a categorical predictor.
# Univariate Cox regressions of time to death on stage and grade, hiding
# the N column and placing the event counts on the level rows.
trial %>%
  select(ttdeath, death, stage, grade) %>%
  tbl_uvregression(
    method = coxph,
    y = Surv(ttdeath, death),
    exponentiate = TRUE,
    hide_n = TRUE
  ) %>%
  add_nevent(location = "level")
Characteristic | Event N | HR1 | 95% CI1 | p-value |
---|---|---|---|---|
T Stage | ||||
T1 | 24 | — | — | |
T2 | 27 | 1.18 | 0.68, 2.04 | 0.6 |
T3 | 22 | 1.23 | 0.69, 2.20 | 0.5 |
T4 | 39 | 2.48 | 1.49, 4.14 | <0.001 |
Grade | ||||
I | 33 | — | — | |
II | 36 | 1.28 | 0.80, 2.05 | 0.3 |
III | 43 | 1.69 | 1.07, 2.66 | 0.024 |
1 HR = Hazard Ratio, CI = Confidence Interval |
Regression model where the covariate remains the same, and the outcome changes.
# Fit one linear model per outcome (age, marker) with treatment as the only
# covariate, showing the treatment effect on a single row per model.
trial %>%
  select(age, marker, trt) %>%
  tbl_uvregression(
    method = lm,
    x = trt,
    show_single_row = "trt",
    hide_n = TRUE
  ) %>%
  modify_header(list(
    label ~ "**Model Outcome**",
    estimate ~ "**Treatment Coef.**"
  )) %>%
  modify_footnote(
    estimate ~ "Values larger than 0 indicate larger values in the Drug B group."
  )
Model Outcome | Treatment Coef.1 | 95% CI2 | p-value |
---|---|---|---|
Age | 0.44 | -3.7, 4.6 | 0.8 |
Marker Level (ng/mL) | -0.20 | -0.44, 0.05 | 0.12 |
1 Values larger than 0 indicate larger values in the Drug B group. | |||
2 CI = Confidence Interval |
Implement a custom tidier to report Wald confidence intervals. The
Wald confidence intervals are calculated using `confint.default()`.
# Custom tidier that reports Wald confidence intervals.
#
# x:            a fitted model object accepted by broom::tidy() and
#               stats::confint.default()
# exponentiate: passed to broom::tidy(); when TRUE the Wald limits are
#               exponentiated as well so they stay on the estimate's scale
# conf.level:   confidence level of the Wald interval
# ...:          accepted so the function can be called with extra arguments;
#               not used
#
# Returns the broom::tidy() result with `conf.low` and `conf.high` columns
# appended.
my_tidy <- function(x, exponentiate = FALSE, conf.level = 0.95, ...) {
  # calculate the confidence intervals at the requested level
  wald_ci <- stats::confint.default(x, level = conf.level)
  if (exponentiate) {
    wald_ci <- exp(wald_ci)
  }

  dplyr::bind_cols(
    broom::tidy(x, exponentiate = exponentiate, conf.int = FALSE),
    # save the limits in a tibble under the column names broom uses
    wald_ci %>%
      tibble::as_tibble() %>%
      rlang::set_names(c("conf.low", "conf.high"))
  )
}

lm(age ~ grade + marker, trial) %>%
  tbl_regression(tidy_fun = my_tidy)
Characteristic | Beta | 95% CI1 | p-value |
---|---|---|---|
Grade | |||
I | — | — | |
II | 0.64 | -4.6, 5.9 | 0.8 |
III | 2.4 | -2.8, 7.6 | 0.4 |
Marker Level (ng/mL) | -0.04 | -2.6, 2.5 | >0.9 |
1 CI = Confidence Interval |
Use significance stars on estimates with low p-values.
# Univariate Cox regressions of time to death on stage and grade, with
# significance stars added to the estimates by add_significance_stars().
trial %>%
  select(ttdeath, death, stage, grade) %>%
  tbl_uvregression(
    method = coxph,
    y = Surv(ttdeath, death),
    exponentiate = TRUE
  ) %>%
  add_significance_stars()
Characteristic | N | HR1,2 | SE2 |
---|---|---|---|
T Stage | 200 | ||
T1 | — | — | |
T2 | 1.18 | 0.281 | |
T3 | 1.23 | 0.295 | |
T4 | 2.48*** | 0.260 | |
Grade | 200 | ||
I | — | — | |
II | 1.28 | 0.241 | |
III | 1.69* | 0.232 | |
1 *p<0.05; **p<0.01; ***p<0.001 | |||
2 HR = Hazard Ratio, SE = Standard Error |