output: prettydoc::html_pretty: theme: cayman highlight: github
sim1000G depends on two R packages: stringr and hapsim. The stringr package is available on CRAN and can be installed as usual; hapsim can be installed from its source package.
install.packages("stringr")
install.packages("hapsim_0.31.tar.gz",repos=NULL, source=T)
The sim1000G package can then be installed:
install.packages("sim1000G_1.04.tar.gz",repos=NULL, source=T)
The genetic map we use is from the HapMap 2010 release, lifted over to GRCh37 coordinates. It was downloaded from:
ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/
Before starting the simulator, a VCF file of the region of interest is needed. The VCF file provides the haplotypes that will be used by the simulator.
For this example we use an unfiltered region from 1000 genomes Phase III sequencing data VCF, chromosome 4, CEU samples.
We also need to initialize all the simulator internal structures with the command startSimulation.
The following parameters can be set:
library(sim1000G)
## Loading required package: hapsim
## Loading required package: MASS
## Loading required package: stringr
download.file("https://adimitromanolakis.github.io/sim1000G/data/region.vcf.gz", destfile = "region.vcf.gz")
vcf = readVCF("region.vcf.gz", maxNumberOfVariants = 100 , min_maf = 0.02 , max_maf = NA)
#downloadGeneticMap(4)
readGeneticMap( chromosome = 4 )
startSimulation(vcf, totalNumberOfIndividuals = 1200)
## [#.......] Reading VCF file..
## [##......] Chromosome:   4  Mbp:  77.35631  Region Size:  348.083 kb  Num of variants: 9038 
## [###.....] Filtering and thinning variants
## [##......] Chromosome:   4  Mbp:  77.35692  Region Size:  338.941 kb  Num of variants: 100 (after filtering)
##       -> Genetic map has 211115 entries
## [1] 0
## [#####...] Creating SIM object
## [#####...] Haplodata object created
For related individuals in pedigrees, we simulate meiotic recombination by using the function SIM$mate(i,j).
Below we show an example of how to simulate several families (10 in the code below) with 2 offspring each.
In addition we write the output to a PED/MAP file in plink format, for further analysis.
# Build one family labelled "fam1" with 2 offspring and display its pedigree table
fam <- newFamilyWithOffspring("fam1", 2)
print(fam)

# Reset the simulation before generating the batch of families
SIM$reset()

## IBD testing only: spread the genetic map evenly over 0-4000 cM so the
## region is long enough to test the IBD computation.
## Remove the next statement for normal use.
SIM$cm <- seq(from = 0, to = 4000, length.out = length(SIM$cm))
# Simulate a batch of families and return them as a single pedigree table.
#
# nfamilies: number of families to generate. Defaults to 10, the count that
#            was previously hard-coded (note that the function name says 100).
#
# Each family x in 1..nfamilies is created with newFamilyWithOffspring(x, 2),
# i.e. the family id is the integer x and every family has 2 offspring.
#
# Returns the per-family tables stacked with rbind(); NULL when nfamilies is 0.
time100families <- function(nfamilies = 10) {
    families <- lapply(
        seq_len(nfamilies),
        function(x) newFamilyWithOffspring(x, 2)
    )
    do.call(rbind, families)
}
fam <- time100families() 
writePED(vcf, fam,"/tmp/out")
## Adding individual  31  from pool
## Adding individual  32  from pool
## Adding individual  33  from specified genotypes
## Adding individual  34  from specified genotypes
##    fid id father mother sex gtindex
## 1 fam1  1      0      0   1      31
## 2 fam1  2      0      0   2      32
## 3 fam1 11      1      2   1      33
## 4 fam1 12      1      2   1      34
## Adding individual  1  from pool
## Adding individual  2  from pool
## Adding individual  3  from specified genotypes
## Adding individual  4  from specified genotypes
## Adding individual  5  from pool
## Adding individual  6  from pool
## Adding individual  7  from specified genotypes
## Adding individual  8  from specified genotypes
## Adding individual  9  from pool
## Adding individual  10  from pool
## Adding individual  11  from specified genotypes
## Adding individual  12  from specified genotypes
## Adding individual  13  from pool
## Adding individual  14  from pool
## Adding individual  15  from specified genotypes
## Adding individual  16  from specified genotypes
## Adding individual  17  from pool
## Adding individual  18  from pool
## Adding individual  19  from specified genotypes
## Adding individual  20  from specified genotypes
## Adding individual  21  from pool
## Adding individual  22  from pool
## Adding individual  23  from specified genotypes
## Adding individual  24  from specified genotypes
## Adding individual  25  from pool
## Adding individual  26  from pool
## Adding individual  27  from specified genotypes
## Adding individual  28  from specified genotypes
## Adding individual  29  from pool
## Adding individual  30  from pool
## Adding individual  31  from specified genotypes
## Adding individual  32  from specified genotypes
## Adding individual  33  from pool
## Adding individual  34  from pool
## Adding individual  35  from specified genotypes
## Adding individual  36  from specified genotypes
## Adding individual  37  from pool
## Adding individual  38  from pool
## Adding individual  39  from specified genotypes
## Adding individual  40  from specified genotypes
## [] PED file written as  /tmp/out.ped
The simulator tracks the locations of all the ancestral alleles in 2 separate arrays. These can be used to compute the IBD1 and IBD2 matrices in arbitrary pedigrees.
Unfortunately, tracking the ancestral alleles makes the simulator a lot slower, so if we don’t need this functionality, we can remove it later.
# Build the pairwise IBD1 and IBD2 matrices over all individuals generated so far.
# Entry [x, y] is the first (IBD1) or second (IBD2) element of
# computePairIBD12(x, y).
n <- SIM$individuals_generated

# vapply() pins the result type; matrix() keeps the n x n shape even for n <= 1
IBD1matrix <- matrix(
    vapply(seq_len(n), function(y) {
        vapply(seq_len(n), function(x) computePairIBD12(x, y)[[1]], numeric(1))
    }, numeric(n)),
    nrow = n, ncol = n
)
IBD2matrix <- matrix(
    vapply(seq_len(n), function(y) {
        vapply(seq_len(n), function(x) computePairIBD12(x, y)[[2]], numeric(1))
    }, numeric(n)),
    nrow = n, ncol = n
)

# Label rows and columns with the individual indices
colnames(IBD1matrix) <- seq_len(nrow(IBD1matrix))
rownames(IBD1matrix) <- seq_len(nrow(IBD1matrix))
colnames(IBD2matrix) <- seq_len(nrow(IBD2matrix))
rownames(IBD2matrix) <- seq_len(nrow(IBD2matrix))
knitr::kable(IBD1matrix[1:8,1:8] )

| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 1.0 | 1.0 | 0 | 0 | 0.00 | 0.00 | 
| 0 | 0 | 1.0 | 1.0 | 0 | 0 | 0.00 | 0.00 | 
| 1 | 1 | 0.0 | 0.5 | 0 | 0 | 0.00 | 0.00 | 
| 1 | 1 | 0.5 | 0.0 | 0 | 0 | 0.00 | 0.00 | 
| 0 | 0 | 0.0 | 0.0 | 0 | 0 | 1.00 | 1.00 | 
| 0 | 0 | 0.0 | 0.0 | 0 | 0 | 1.00 | 1.00 | 
| 0 | 0 | 0.0 | 0.0 | 1 | 1 | 0.00 | 0.43 | 
| 0 | 0 | 0.0 | 0.0 | 1 | 1 | 0.43 | 0.00 | 
colnames(IBD1matrix) = 1:nrow(IBD1matrix)
rownames(IBD1matrix) = 1:nrow(IBD1matrix)
colnames(IBD2matrix) = 1:nrow(IBD2matrix)
rownames(IBD2matrix) = 1:nrow(IBD2matrix)
knitr::kable(IBD2matrix[1:8,1:8] )

| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.00 | 0.00 | 0 | 0 | 0.00 | 0.00 | 
| 0 | 1 | 0.00 | 0.00 | 0 | 0 | 0.00 | 0.00 | 
| 0 | 0 | 1.00 | 0.22 | 0 | 0 | 0.00 | 0.00 | 
| 0 | 0 | 0.22 | 1.00 | 0 | 0 | 0.00 | 0.00 | 
| 0 | 0 | 0.00 | 0.00 | 1 | 0 | 0.00 | 0.00 | 
| 0 | 0 | 0.00 | 0.00 | 0 | 1 | 0.00 | 0.00 | 
| 0 | 0 | 0.00 | 0.00 | 0 | 0 | 1.00 | 0.32 | 
| 0 | 0 | 0.00 | 0.00 | 0 | 0 | 0.32 | 1.00 | 
The function to generate family data can be extended to simulate arbitrary pedigrees; it is shown below:
# Simulate one family: two unrelated founders and their offspring.
#
# familyid:   family identifier stored in the `fid` column (number or string).
# noffspring: number of children to simulate (default 2). 0 returns the two
#             founders only.
#
# Returns a data.frame with one row per individual and the columns
# fid, id, father, mother, sex, gtindex. Founders get ids 1 and 2 with
# father = mother = 0; offspring get ids 11, 12, ... with father = 1,
# mother = 2 and sex = 1.
newFamilyWithOffspring <- function(familyid, noffspring = 2) {

    # Founders are added first, then one SIM$mate() call per child
    j1 <- SIM$addUnrelatedIndividual()
    j2 <- SIM$addUnrelatedIndividual()

    # seq_len() makes noffspring = 0 a no-op; 1:0 would iterate over c(1, 0)
    # and create two spurious children
    offspring_gt <- numeric(noffspring)
    for (i in seq_len(noffspring)) {
        offspring_gt[i] <- SIM$mate(j1, j2)
    }

    # Build each column with its own type. Appending rows via
    # rbind(fam, c(familyid, ...)) coerced every column to character
    # whenever familyid was a string.
    data.frame(
        fid     = rep(familyid, 2 + noffspring),
        id      = c(1, 2, 10 + seq_len(noffspring)),
        father  = c(0, 0, rep(1, noffspring)),
        mother  = c(0, 0, rep(2, noffspring)),
        sex     = c(1, 2, rep(1, noffspring)),
        # Holds the genotype position in the arrays SIM$gt1 and SIM$gt2
        gtindex = c(j1, j2, offspring_gt)
    )
}
In this example, we generate a pedigree with 6 individuals, across 3 generations. After that, we compute the IBD matrices of the family.
# Reset simulation
SIM$reset()

# Set the region size in cM (0-4000 cM, for testing the correctness of the function)
SIM$cm <- seq(0, 4000, length.out = length(SIM$cm))

# Pedigree: A and B are unrelated founders with children C and D;
# G is a further unrelated individual, and E is the child of G and C.
A <- SIM$addUnrelatedIndividual()
B <- SIM$addUnrelatedIndividual()
C <- SIM$mate(A, B)
D <- SIM$mate(A, B)
G <- SIM$addUnrelatedIndividual()
E <- SIM$mate(G, C)

# Pairwise IBD for C vs D and for E vs A
computePairIBD12(C, D)
computePairIBD12(E, A)

# Full pairwise matrices: entry [x, y] is element 1 (IBD1) or element 2 (IBD2)
# of computePairIBD12(x, y). Rows are labelled with the individual index;
# columns are left unnamed, as before.
n <- SIM$individuals_generated

IBD1matrix <- matrix(
    vapply(seq_len(n), function(y) {
        vapply(seq_len(n), function(x) computePairIBD12(x, y)[[1]], numeric(1))
    }, numeric(n)),
    nrow = n, ncol = n,
    dimnames = list(as.character(seq_len(n)), NULL)
)
IBD2matrix <- matrix(
    vapply(seq_len(n), function(y) {
        vapply(seq_len(n), function(x) computePairIBD12(x, y)[[2]], numeric(1))
    }, numeric(n)),
    nrow = n, ncol = n,
    dimnames = list(as.character(seq_len(n)), NULL)
)
printMatrix(IBD1matrix)
## Adding individual  1  from pool
## Adding individual  2  from pool
## Adding individual  3  from specified genotypes
## Adding individual  4  from specified genotypes
## Adding individual  5  from pool
## Adding individual  6  from specified genotypes
## IBD1 IBD2 
## 0.44 0.35 
## IBD1 IBD2 
##  0.6  0.0 
##          [   1]  [   2]  [   3]  [   4]  [   5]  [   6]  
## [   1]    0.000   0.000   1.000   1.000   0.000   0.600  
## [   2]    0.000   0.000   1.000   1.000   0.000   0.400  
## [   3]    1.000   1.000   0.000   0.440   0.000   1.000  
## [   4]    1.000   1.000   0.440   0.000   0.000   0.570  
## [   5]    0.000   0.000   0.000   0.000   0.000   1.000  
## [   6]    0.600   0.400   1.000   0.570   1.000   0.000