Exercise 25. Localised melanoma : generating and analysing a nested case-control (NCC) study


library(biostat3) # cox and conditional logistic analyses
library(Epi)      # sample a nested case-control study from a cohort
## Get the data for exercise 25 and have a look at it
data(melanoma)
mel <- subset(melanoma, stage=="Localised")           # restrict the cohort to stage==1
mel <- transform(mel,
                 dc = (mel$status=="Dead: cancer" & surv_mm<120)+0,
                 surv_10y = pmin(120, surv_mm))
table(mel$dc , mel$status)                            # check
str(mel$sex) ; str(mel$year8594) ; str(mel$agegrp)    #Check structure of risk factors/confounders
##  Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 1 2 2 ...
##  Factor w/ 2 levels "Diagnosed 75-84",..: 1 1 1 1 1 1 1 1 1 1 ...
##  Factor w/ 4 levels "0-44","45-59",..: 4 4 4 4 4 4 4 4 4 4 ...
out_coh <- coxph(Surv(surv_10y,dc) ~ sex + year8594 + agegrp, data = mel)
summary(out_coh)
## Call:
## coxph(formula = Surv(surv_10y, dc) ~ sex + year8594 + agegrp, 
##     data = mel)
## 
##   n= 5318, number of events= 960 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)    
## sexFemale               -0.53061   0.58825  0.06545 -8.107 5.55e-16 ***
## year8594Diagnosed 85-94 -0.33339   0.71649  0.06618 -5.037 4.72e-07 ***
## agegrp45-59              0.28283   1.32688  0.09417  3.003  0.00267 ** 
## agegrp60-74              0.62006   1.85904  0.09088  6.823 8.90e-12 ***
## agegrp75+                1.21801   3.38045  0.10443 11.663  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## sexFemale                  0.5882     1.7000    0.5174    0.6688
## year8594Diagnosed 85-94    0.7165     1.3957    0.6293    0.8157
## agegrp45-59                1.3269     0.7536    1.1032    1.5959
## agegrp60-74                1.8590     0.5379    1.5557    2.2215
## agegrp75+                  3.3804     0.2958    2.7547    4.1483
## 
## Concordance= 0.646  (se = 0.01 )
## Rsquare= 0.039   (max possible= 0.949 )
## Likelihood ratio test= 212.7  on 5 df,   p=0
## Wald test            = 217.9  on 5 df,   p=0
## Score (logrank) test = 226.8  on 5 df,   p=0

(a)

There are 5318 individuals in the study that we would need to collect data for if we were to use the complete cohort of patients.

n_ind <- length(mel$id)
n_ind
## [1] 5318

(b)

960 cancer patients die from melanoma during the first 10 years of follow-up.

table(mel$dc, useNA="always")
## 
##    0    1 <NA> 
## 4358  960    0
ncase <-  table (mel$dc, useNA="always")[2]
ncase
##   1 
## 960

(c1)

nccdata <-ccwc( entry=0, exit=surv_10y , fail=dc, origin=0, controls=1, include=list(sex,year8594,agegrp,dc,id), data=mel )
## 
## Sampling risk sets: .....................................................................................................................
tail(nccdata, 8)
##      Set  Map  Time Fail    sex        year8594 agegrp dc   id
## 1913 114 2674  87.5    1 Female Diagnosed 85-94   0-44  1 3784
## 1914 114 2425  87.5    0   Male Diagnosed 85-94  45-59  0 3405
## 1915 115 4275  98.5    1   Male Diagnosed 85-94  60-74  1 6168
## 1916 115 2379  98.5    0   Male Diagnosed 85-94  60-74  0 3335
## 1917 116 5287   2.5    1   Male Diagnosed 85-94  45-59  1 7713
## 1918 116 5184   2.5    0   Male Diagnosed 85-94  45-59  0 7555
## 1919 117 5314 119.5    1   Male Diagnosed 85-94  60-74  1 7753
## 1920 117 1050 119.5    0   Male Diagnosed 75-84   0-44  0 1482

(c2)

out_ncc <- clogit(Fail ~ sex + year8594 + agegrp + strata(Set), data=nccdata)
summary(out_ncc)
## Call:
## coxph(formula = Surv(rep(1, 1920L), Fail) ~ sex + year8594 + 
##     agegrp + strata(Set), data = nccdata, method = "exact")
## 
##   n= 1920, number of events= 960 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)    
## sexFemale               -0.60257   0.54740  0.09631 -6.256 3.94e-10 ***
## year8594Diagnosed 85-94 -0.44308   0.64205  0.09770 -4.535 5.76e-06 ***
## agegrp45-59              0.30355   1.35466  0.12615  2.406   0.0161 *  
## agegrp60-74              0.65312   1.92153  0.12573  5.194 2.05e-07 ***
## agegrp75+                1.36296   3.90772  0.16580  8.221 2.22e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## sexFemale                  0.5474     1.8268    0.4532    0.6611
## year8594Diagnosed 85-94    0.6421     1.5575    0.5302    0.7776
## agegrp45-59                1.3547     0.7382    1.0579    1.7346
## agegrp60-74                1.9215     0.5204    1.5018    2.4585
## agegrp75+                  3.9077     0.2559    2.8236    5.4082
## 
## Rsquare= 0.06   (max possible= 0.701 )
## Likelihood ratio test= 119.2  on 5 df,   p=0
## Wald test            = 108.9  on 5 df,   p=0
## Score (logrank) test = 115.7  on 5 df,   p=0

(d)

n_uni <- length(unique(nccdata$id))
n_uni
## [1] 1714

(e)

Note that, since every nested case-control study is different, the parameter estimates you obtain will not be identical to those above. However, the hazard ratios from the two models should be very similar. The standard errors are slightly larger for the nested case-control study since the estimates are based on a sample from the full cohort. Loss of precision is the trade-off we have to make when designing a nested case-control study. The precision can be improved by adding more controls to each case.

comp <- exp(data.frame(coef(out_coh),coef(out_ncc)))
colnames(comp) <- c("cohort HR", "NCC HR")
comp                           # print the HR estimates from the cohort and the NCC
##                         cohort HR    NCC HR
## sexFemale               0.5882470 0.5474009
## year8594Diagnosed 85-94 0.7164882 0.6420532
## agegrp45-59             1.3268817 1.3546591
## agegrp60-74             1.8590433 1.9215327
## agegrp75+               3.3804475 3.9077244
var <- data.frame(diag(vcov(out_coh)), diag(vcov(out_ncc)), diag(vcov(out_coh))/diag(vcov(out_ncc)))
colnames(var) <- c("cohort var", "NCC var", "ratio coh/ncc" )
var                            # print the variances of estimates from the cohort and the NCC and their ratio
##                          cohort var     NCC var ratio coh/ncc
## sexFemale               0.004283749 0.009276456     0.4617872
## year8594Diagnosed 85-94 0.004380163 0.009545333     0.4588801
## agegrp45-59             0.008868630 0.015913788     0.5572922
## agegrp60-74             0.008258394 0.015809093     0.5223825
## agegrp75+               0.010906329 0.027488474     0.3967601

(f)

We see that there is sampling variation in the parameter estimates from the five nested case-control studies but they are centered on the full cohort estimate. We see that the standard errors of the estimates from the nested case-control studies are larger than for the full cohort but there is some sampling variation.

M <- 20                   # Number of loops: change the M value to change the number of loops
param   <- matrix(0,M,5)  # Define the matrice of the coefficients
for (i in 1:M)  {         # Start of the loop, create NCC data and analyse it
    nccdata <-ccwc( entry=0, exit=surv_10y , fail=dc, origin=0, controls=1, include=list(sex,year8594,agegrp), data=mel )
    out_ncc <- clogit(Fail ~ sex + year8594 + agegrp + strata(Set), data=nccdata)
    param [ i , 1:5] <- coef(out_ncc)   # store the 5 coefficients M times
}                  # End of the loop
param <- as.data.frame(param)        # data frame with the coefficients
param <- exp(param)                  # data frame with the HR
param <- rbind (summary(out_coh)$coef[c(1:5),2], param)
colnames(param) <- names(coef(out_coh))
rownames(param) <- c("cohort", c(1:M))
param
##        sexFemale year8594Diagnosed 85-94 agegrp45-59 agegrp60-74 agegrp75+
## cohort 0.5882470               0.7164882    1.326882    1.859043  3.380448
## 1      0.6167863               0.7413962    1.373123    2.151342  3.398660
## 2      0.5653483               0.6344868    1.416727    1.909224  3.533715
## 3      0.6021715               0.6717362    1.353716    1.918158  2.990460
## 4      0.6166861               0.7033602    1.523220    1.900843  3.579560
## 5      0.6531324               0.6759047    1.259635    2.069813  3.138128
## 6      0.5280217               0.7433057    1.264436    1.765478  3.621152
## 7      0.6019036               0.6674855    1.245936    1.788824  4.444367
## 8      0.5594090               0.6633932    1.299240    1.716498  3.669968
## 9      0.5186639               0.6798896    1.299618    1.669272  3.472052
## 10     0.6040502               0.7788994    1.310728    1.851865  3.466284
## 11     0.5718648               0.6498968    1.389561    1.832362  3.730724
## 12     0.5550892               0.7857916    1.287573    1.793193  3.518383
## 13     0.5451861               0.7219160    1.375814    1.949375  4.154806
## 14     0.5365412               0.6863693    1.167169    1.921206  3.716960
## 15     0.5572231               0.6521943    1.289318    1.674674  3.070881
## 16     0.5605417               0.7201568    1.261123    1.619856  2.823109
## 17     0.6130899               0.6929343    1.409247    1.994453  3.788684
## 18     0.5252033               0.6783916    1.261320    1.838816  3.090235
## 19     0.5667135               0.6737309    1.238296    1.561176  3.079474
## 20     0.5468751               0.7317959    1.421108    2.230440  4.866750
mean_param <- apply(param[c(2:M),], 2, mean)  # compute the mean of the HR for the M loops
sd_param <-  apply(param[c(2:M),], 2, sd)     # compute the sd of the HR for the M loops
par_sum <- rbind (summary(out_coh)$coef[c(1:5),2], mean_param, sd_param)
rownames(par_sum) <- c("cohort HR","mean HR NCC","sd HR NCC")
colnames(par_sum) <- names(coef(out_coh))
par_sum
##              sexFemale year8594Diagnosed 85-94 agegrp45-59 agegrp60-74
## cohort HR   0.58824704              0.71648818  1.32688166   1.8590433
## mean HR NCC 0.57355926              0.69585469  1.31714735   1.8382330
## sd HR NCC   0.03717117              0.04243116  0.08248321   0.1518786
##             agegrp75+
## cohort HR   3.3804475
## mean HR NCC 3.4888212
## sd HR NCC   0.4043571
par(mfrow=c(2,3) )                                   # allow 2*3 graphs on the same page
for (i in 1:5) {
    hist(param[,i], main=colnames(par_sum)[i], xlab="HR value")  # histogram of HR for variable sex (female vs male)
    abline(v=summary(out_coh)$coef[i,2], col="green")
    abline(v=mean_param[i], col="red")
}
plot.new()
legend(0, 0.5, c("Cox HR value","Mean NCC"),lty=1,col=c("green","red"),bty="n")