The nanostringr R package is a companion to the manuscript:
A. Talhouk, S. Kommoss, R. Mackenzie, M. Cheung, S. Leung, D. Chiu, S. Kalloger, D. Huntsman, S. Chen, M. Intermaggio, J. Gronwald, F. Chan, S. Ramus, C. Steidl, D. Scott, M. Anglesio. (2016). Single-patient molecular testing with NanoString nCounter Data using a reference-based strategy for batch effect correction. PLoS ONE.
This vignette provides a guide to reproducing the analyses in the paper and documents the use of some of the package's functions.
Included in this package are several datasets that are described in detail in the manuscript and that are annotated in expQC, the annotation data frame for all of the experiments, run in different CodeSets, including:
The NanoStringQC function computes several QC metrics and appends them to the annotation matrix. Care should be taken to ensure that the data is in the proper format. The function returns different QC metric flags that can be used to filter out samples that fail QC.
library(nanostringr)
# Run QC on each cohort's raw expression data. The second argument is the
# subset of the annotation data frame marked "Yes" for that cohort; the
# result is that annotation with the QC metrics appended.
expOVD <- NanoStringQC(ovd.r, subset(expQC, OVD == "Yes"))
expOVO <- NanoStringQC(ovo.r, subset(expQC, OVO == "Yes"))
expOVCL <- NanoStringQC(ovc.r, subset(expQC, OVCL == "Yes"))
expHLD <- NanoStringQC(hld.r, subset(expQC, HLD == "Yes"))
expHLO <- NanoStringQC(hlo.r, subset(expQC, HLO == "Yes"))

# Stack the per-cohort results. This overwrites the original expQC
# annotation, so the subset() calls above must run before this line.
expQC <- rbind(expHLD, expOVD, expHLO, expOVO, expOVCL)

# Label every row with its cohort. The rep() calls must stay in the same
# order as the rbind() above so that the labels line up with the rows.
expQC$cohort <- factor(
  x = c(
    rep("HLD", nrow(expHLD)),
    rep("OVD", nrow(expOVD)),
    rep("HLO", nrow(expHLO)),
    rep("OVO", nrow(expOVO)),
    rep("OVCL", nrow(expOVCL))
  ),
  levels = c("HLD", "OVD", "OVCL", "HLO", "OVO"),
  labels = c("HL", "OC", "OVCL", "HLO", "OVO")
)
# Distribution of perFOV within each cohort, one box per cohort.
boxplot(
  perFOV ~ cohort,
  ylab = "% FOV",
  main = "% FOV by Cohort",
  data = expQC,
  pch = 20,
  col = c(COL.HLD, COL.OVD, COL.OVCL, COL.HLO, COL.OVO)
)
# Dashed red reference line at 75, then default grid lines on both axes.
abline(h = 75, lty = 2, col = "red")
grid(nx = NULL, ny = NULL, lwd = 1)
# Distribution of linPC within each cohort; the y axis is fixed to [0, 1].
boxplot(
  linPC ~ cohort,
  ylab = expression(R^2),
  main = "Linearity of Positive Controls by Cohort",
  data = expQC,
  pch = 20,
  col = c(COL.HLD, COL.OVD, COL.OVCL, COL.HLO, COL.OVO),
  ylim = c(0, 1)
)
# Dashed red reference line at 0.95, then default grid lines on both axes.
abline(h = 0.95, lty = 2, col = "red")
grid(nx = NULL, ny = NULL, lwd = 1)
# Distribution of averageHK within each cohort.
boxplot(
  averageHK ~ cohort,
  ylab = "Average log HK expression",
  main = "Average log expression of Housekeeping genes by Cohort",
  data = expQC,
  pch = 20,
  col = c(COL.HLD, COL.OVD, COL.OVCL, COL.HLO, COL.OVO)
)
# Dashed red reference line at 50, then default grid lines on both axes.
abline(h = 50, lty = 2, col = "red")
grid(nx = NULL, ny = NULL, lwd = 1)
# Distribution of lod within each cohort.
boxplot(
  lod ~ cohort,
  ylab = "LOD",
  main = "Limit of detection (LOD) by Cohort",
  data = expQC,
  pch = 20,
  col = c(COL.HLD, COL.OVD, COL.OVCL, COL.HLO, COL.OVO)
)
# Dashed red reference line at 50, then default grid lines on both axes.
abline(h = 50, lty = 2, col = "red")
grid(nx = NULL, ny = NULL, lwd = 1)
# Boxes of pergd per cohort, drawn with white borders so that the individual
# samples added below remain visible on top of them.
boxplot(
  pergd ~ cohort,
  data = expQC,
  border = "white",
  ylab = "% Genes Detected",
  main = "Percent of Genes Detected Above \n the Limit of Detection",
  pch = 20,
  col = c(COL.HLD, COL.OVD, COL.OVCL, COL.HLO, COL.OVO)
)
# Dashed red reference line at 50, then default grid lines on both axes.
abline(h = 50, lty = 2, col = "red")
grid(nx = NULL, ny = NULL, lwd = 1)
# Overlay every sample as a small jittered point on the existing plot.
stripchart(
  pergd ~ cohort,
  data = expQC,
  vertical = TRUE,
  method = "jitter",
  pch = 20,
  cex = 0.4,
  col = "#3A6EE3",
  add = TRUE
)
# Reference values drawn on the scatter plots below:
# `sn` is the vertical line position (signal to noise ratio axis) and
# `detect` is the horizontal line position (% genes detected axis).
sn <- 100
detect <- 60
# Signal to noise ratio against % genes detected. The OVD cohort sets up the
# axes (x range taken from OVD only); the other cohorts are added as points.
plot(
  expOVD$sn, expOVD$pergd,
  pch = 20,
  col = COL.OVD,
  xaxt = "n", # x axis is drawn manually below
  ylim = c(0, 100),
  xlim = range(expOVD$sn),
  xlab = "Signal to Noise Ratio",
  ylab = "% Genes Detected"
)
points(expOVO$sn, expOVO$pergd, pch = 20, col = COL.OVO)
points(expOVCL$sn, expOVCL$pergd, pch = 20, col = COL.OVCL)
points(expHLD$sn, expHLD$pergd, pch = 20, col = COL.HLD)
points(expHLO$sn, expHLO$pergd, pch = 20, col = COL.HLO)

# Custom x axis with a tick every 300 units, up to the largest sn over all
# cohorts.
axis(1, at = seq(0, max(expQC$sn) + 1, 300))

# Reference lines: solid red vertical at `sn`, dashed horizontal at `detect`.
abline(v = sn, col = "red", lwd = 2)
abline(h = detect, lty = 2)

title("Signal to Noise vs \n Ratio of Genes Detected")
legend(
  "bottomright",
  c("HL", "OC", "OVCL", "HLO", "OVO"),
  pch = 20,
  bty = "n",
  col = c(COL.HLD, COL.OVD, COL.OVCL, COL.HLO, COL.OVO)
)
# Same scatter as the previous plot, but with the x axis fixed to [0, 6000]
# instead of the range of the OVD cohort.
plot(
  expOVD$sn, expOVD$pergd,
  pch = 20,
  col = COL.OVD,
  xaxt = "n", # x axis is drawn manually below
  ylim = c(0, 100),
  xlim = c(0, 6000),
  xlab = "Signal to Noise Ratio ",
  ylab = "Ratio of Genes Detected"
)
points(expOVO$sn, expOVO$pergd, pch = 20, col = COL.OVO)
points(expOVCL$sn, expOVCL$pergd, pch = 20, col = COL.OVCL)
points(expHLD$sn, expHLD$pergd, pch = 20, col = COL.HLD)
points(expHLO$sn, expHLO$pergd, pch = 20, col = COL.HLO)

# Custom x axis with a tick every 300 units, up to the largest sn over all
# cohorts.
axis(1, at = seq(0, max(expQC$sn) + 1, 300))

# Reference lines: solid red vertical at `sn`, dashed horizontal at `detect`.
abline(v = sn, col = "red", lwd = 2)
abline(h = detect, lty = 2)

title("Signal to Noise vs \n Ratio of Genes Detected (Zooming-in)")
legend(
  "bottomright",
  c("HL", "OC", "OVCL", "HLO", "OVO"),
  pch = 20,
  bty = "n",
  col = c(COL.HLD, COL.OVD, COL.OVCL, COL.HLO, COL.OVO)
)
The HKnorm function performs a simple normalization of log-transformed (base 2) gene expression data to housekeeping genes. This is done by subtracting the average log housekeeping gene expression level from the expression level of every gene in each sample.
As an example, here we check the QC metrics of the Hodgkin Lymphoma data and normalize it to housekeeping genes:
We can check whether any samples failed the QC metrics:
# Keep a copy of the annotation before any samples are removed.
expHLD0 <- expHLD

# Did any sample fail QC?
any(expHLD0$QCFlag == "Failed")
#> [1] TRUE

# IDs of the samples that failed.
expHLD0$sampleID[which(expHLD0$QCFlag == "Failed")]
#> [1] "HL1_3"  "HL1_13" "HL1_17" "HL1_18" "HL2_3"
Since these are matched samples, we must remove both members of a pair from the annotation data frame and from the gene expression data frame.
# Drop the HL1_18 / HL2_18 pair from the annotation data frame ...
expHLD <- dplyr::filter(expHLD0, sampleID != "HL1_18" & sampleID != "HL2_18")
# ... and drop the same two columns from the raw expression data.
hld <- hld.r[, !colnames(hld.r) %in% c("HL1_18", "HL2_18")]
We now normalize the resulting gene expression data
# The two HKnorm() calls are alternatives: run the one that matches your
# data. Run in sequence as written, the second assignment overwrites the
# first.

# If data already log normalized
hld.n <- HKnorm(hld, is.logged = TRUE)

# Otherwise, normalize to HK
hld.n <- HKnorm(hld)

# Split the normalized expression data by sample-name prefix, and the
# annotation by geneRLF, into the HL1 and HL2 sets.
hld1 <- hld.n[, grep("HL1", colnames(hld.n))]
exp.hld1 <- subset(expHLD, geneRLF == "HL1")

hld2 <- hld.n[, grep("HL2", colnames(hld.n))]
exp.hld2 <- subset(expHLD, geneRLF == "HL2")
The refMethod function performs batch adjustment using a reference-based strategy.
Below is how this would work for the HL data:
r <- 3 # The number of references to use

# Select reference samples randomly. No seed is set before this call, so the
# chosen references change between runs.
choice.refs <- exp.hld1$sampleID[
  sample(seq_len(nrow(exp.hld1)), r, replace = FALSE)
]

# Names of the HL2 samples that correspond to the chosen HL1 references.
refs.hl2 <- paste("HL2", getNum(choice.refs), sep = "_")

R1 <- t(hld1[, choice.refs]) # reference samples, HL1 columns
R2 <- t(hld2[, refs.hl2]) # the same references, HL2 columns
Y <- t(hld2[, !colnames(hld2) %in% refs.hl2]) # remaining HL2 samples

# Data from CodeSet 2 now calibrated for CodeSet 1
S2.r <- t(refMethod(Y, R1, R2))
We can check the result by selecting a random gene and plotting its expression values from both CodeSets:
# Pick one gene (row) at random; the seed makes the choice reproducible.
set.seed(2016)
gene <- sample(seq_len(nrow(hld1)), 1)

# Two panels side by side; the previous graphics settings are restored at
# the end.
op <- par(mfrow = c(1, 2))

# Left: HL1 against HL2 expression for the gene, before correction.
plot(t(hld1[gene, ]), t(hld2[gene, ]),
  xlab = "HL1", ylab = "HL2",
  main = "No Correction"
)
abline(0, 1) # identity line

# Right: HL1 (reference samples excluded) against the corrected HL2 values.
plot(t(hld1[gene, !(colnames(hld1) %in% choice.refs)]), t(S2.r[gene, ]),
  xlab = "HL1", ylab = "HL2", main = "Corrected"
)
abline(0, 1) # identity line

par(op)