Introduction

This file contains some examples of the functions related to the DDC routine. More specifically, DDC and cellMap will be illustrated.

library("cellWise")
library("gridExtra") # has grid.arrange()

# Default options for DDC:
DDCpars = list(fracNA = 0.5, numDiscrete = 3, precScale = 1e-12,
               cleanNAfirst = "automatic", tolProb = 0.99, 
               corrlim = 0.5, combinRule = "wmean",
               returnBigXimp = FALSE, silent = FALSE,
               nLocScale = 25000, fastDDC = FALSE,
               standType = "1stepM", corrType = "gkwls",
               transFun = "wrap", nbngbrs = 100)

# A small list giving the same results:
DDCpars = list(fastDDC = FALSE)

Example with row and column selection

First we illustrate the selection of columns and rows by DDC. This is actually done by the function checkDataSet which is called by DDC.

i = c(1,2,3,4,5,6,7,8,9) 
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii") 
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE) 
V1 = c(1.3,NaN,4.5,2.7,20.0,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,-10,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5) 
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s) 
datafr

##   i name logic   V1    V2  V3  Vna Vdis V0s
## 1 1   aa  TRUE  1.3   2.3   2    1    1 1.0
## 2 2   bb FALSE  NaN    NA Inf   -4    1 1.5
## 3 3   cc  TRUE  4.5   5.0   3    2    2 2.0
## 4 4   dd FALSE  2.7   6.0  -4  NaN    2 2.0
## 5 5   ee FALSE 20.0   7.0   5    3    3 2.0
## 6 6   ff  TRUE  4.4   8.0   6 -Inf    3 2.0
## 7 7   gg  TRUE -2.1   4.0   7   NA    3 2.0
## 8 8   hh  TRUE  1.1 -10.0  -2    6    1 3.0
## 9 9   ii FALSE -5.0   0.5   8    5    2 2.5

DDCdatafr = DDC(datafr,DDCpars)

##  
##  The input data has 9 rows and 9 columns.
##  
##  The input data contained 2 non-numeric columns (variables).
##  Their column names are:
##  
## [1] name  logic
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 7 numeric columns:
##  
## [1] i    V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 columns that were identical to the case number
##  (number of the row).
##  Their column names are:
##  
## [1] i
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 6 columns:
##  
## [1] V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 discrete columns with 3 or fewer values.
##  Their column names are:
##  
## [1] Vdis
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 5 columns:
##  
## [1] V1  V2  V3  Vna V0s
##  
##  The data contained 1 columns with zero or tiny median absolute deviation.
##  Their column names are:
##  
## [1] V0s
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 4 columns:
##  
## [1] V1  V2  V3  Vna
##  
##  The final data set we will analyze has 9 rows and 4 columns.
##

remX = DDCdatafr$remX; dim(remX)

## [1] 9 4

cellMap(D=remX, R=DDCdatafr$stdResid, rowlabels = 1:nrow(remX), 
        columnlabels = colnames(remX))

plot of chunk unnamed-chunk-3

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.

Small generated dataset

set.seed(12345) # for reproducibility
n <- 50; d <- 20
A <- matrix(0.9, d, d); diag(A) = 1
round(A[1:10,1:10],1) # true covariance matrix

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [2,]  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [3,]  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [4,]  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9   0.9
##  [5,]  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9   0.9
##  [6,]  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9   0.9
##  [7,]  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9   0.9
##  [8,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9   0.9
##  [9,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0   0.9
## [10,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   1.0

library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 50, FALSE)] <- 10
x[sample(1:(n * d), 50, FALSE)] <- -10

# When not specifying DDCpars in the call to DDC
# all defaults are used:
DDCx <- DDC(x)

##  
##  The input data has 50 rows and 20 columns.

cellMap(D=DDCx$remX, R=DDCx$stdResid, columnlabels = 1:d, 
        rowlabels = 1:n)

plot of chunk unnamed-chunk-4

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.

DDCx$DDCpars # These are the default options:

## $fracNA
## [1] 0.5
## 
## $numDiscrete
## [1] 3
## 
## $precScale
## [1] 1e-12
## 
## $cleanNAfirst
## [1] "automatic"
## 
## $tolProb
## [1] 0.99
## 
## $corrlim
## [1] 0.5
## 
## $combinRule
## [1] "wmean"
## 
## $returnBigXimp
## [1] FALSE
## 
## $silent
## [1] FALSE
## 
## $nLocScale
## [1] 25000
## 
## $fastDDC
## [1] FALSE
## 
## $standType
## [1] "1stepM"
## 
## $corrType
## [1] "gkwls"
## 
## $transFun
## [1] "wrap"
## 
## $nbngbrs
## [1] 100

names(DDCx)

##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"   "namesNotNumeric"
##  [5] "namesCaseNumber" "namesNAcol"      "namesNArow"      "namesDiscrete"  
##  [9] "namesZeroScale"  "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"           "robcors"        
## [17] "robslopes"       "deshrinkage"     "Xest"            "scalestres"     
## [21] "stdResid"        "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"          "indNAs"         
## [29] "Ximp"

# We will now go through these outputs one by one:

DDCx$colInAnalysis # all columns X1,...,X20 remain:

##  X1  X2  X3  X4  X5  X6  X7  X8  X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20

DDCx$rowInAnalysis # all rows 1,...,50 remain:

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

DDCx$namesNotNumeric # no non-numeric columns:

## NULL

DDCx$namesCaseNumber # no column was the case number:

## NULL

DDCx$namesNAcol # no columns with too many NA's:

## NULL

DDCx$namesNArow # no columns with too many NA's:

## NULL

DDCx$namesDiscrete # no discrete columns:

## NULL

DDCx$namesZeroScale # no columns with scale = 0:

## NULL

dim(DDCx$remX) # remaining data matrix

## [1] 50 20

round(DDCx$locX,2) # robust location estimates of all 20 columns:

##    X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13 
## -0.13 -0.50 -0.20 -0.19 -0.19 -0.16 -0.62 -0.35 -0.47 -0.24 -0.34 -0.43 -0.26 
##   X14   X15   X16   X17   X18   X19   X20 
## -0.28 -0.31 -0.31 -0.38 -0.16 -0.36 -0.27

round(DDCx$scaleX,2) # robust scale estimates of all 20 columns:

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
## 1.42 1.34 1.12 1.25 1.28 1.15 1.13 1.29 1.17 1.11 1.25 1.49 1.18 1.57 1.37 1.37 
##  X17  X18  X19  X20 
## 1.36 1.47 1.26 1.23

round(DDCx$Z[1:10,1:10],1)

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,] -0.5 -0.5 -0.4 -0.3  0.0 -0.1 -0.2 -0.5  0.3   0.2
##  [2,]  0.0  0.1 -0.6 -0.2 -0.7 -8.5 -0.2 -0.9 -0.1  -0.3
##  [3,]  0.2  0.4  0.5  0.4  0.2  0.2  0.4  0.7   NA  -0.2
##  [4,]  0.4  0.6  0.2   NA   NA  0.5  0.7 -7.5  0.8   0.7
##  [5,] -0.6  0.0   NA -0.2 -0.2 -0.3  0.7 -0.4 -0.2  -0.7
##  [6,]  1.5  1.6  1.6  1.5  1.3  1.7  2.5  1.3  1.7   2.1
##  [7,]   NA -0.1 -0.3 -0.6 -0.7 -0.4 -0.1  8.0  0.0   0.0
##  [8,]  0.4  0.5  0.9  0.2  0.6  0.5 -8.3  0.5  0.6   0.1
##  [9,]  0.4  0.4  0.3  0.6  0.5  0.6  0.4  0.2  0.1   0.5
## [10,]  0.3  0.9  1.0  0.5  1.2  0.6   NA  1.1  1.8   0.9

# Robustly standardized dataset. Due to the high correlations,
# cells in the same row look similar (except for outlying cells).

DDCx$nbngbrs

## [1] 19

# For each column the code looked for up to 19 non-self neighbors (highly correlated columns).
# It goes through all of them, unless fastDDC is set to TRUE.

DDCx$ngbrs[1:3,]

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1   18    4   19   14   16   11   13    6    12    10     5    17    20
## [2,]    2   15    4   19   16   12   13    9    7    11    17     6     8     5
## [3,]    3    5   13    4   20   14    6   16    1    17    15     2    19    18
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]    15     2     3     8     9     7
## [2,]    14    20     1     3    10    18
## [3,]    11     9    12     8     7    10

# Shows the neighbors, e.g. the nearest non-self neighbor of X1 is X11, then X2,...

round(DDCx$robcors[1:3,],2)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1 0.95 0.93 0.92 0.92 0.92 0.92 0.92 0.91  0.91  0.91  0.91  0.91  0.90
## [2,]    1 0.94 0.94 0.93 0.93 0.93 0.93 0.93 0.92  0.92  0.92  0.91  0.91  0.91
## [3,]    1 0.94 0.93 0.93 0.92 0.92 0.91 0.90 0.89  0.89  0.89  0.89  0.89  0.88
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.90  0.90  0.89  0.89  0.87  0.86
## [2,]  0.91  0.90  0.90  0.89  0.86  0.85
## [3,]  0.88  0.87  0.87  0.86  0.86  0.86

# Robust correlations with these neighbors. In each row the correlations
# are sorted by decreasing absolute value. 

round(DDCx$robslopes[1:3,],2)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1 0.97 0.81 0.91 0.97 0.82 0.78 0.78 0.78  0.91  0.71  0.79  0.80  0.82
## [2,]    1 0.95 0.91 0.99 0.91 1.03 0.90 0.88 0.76  0.83  0.87  0.85  0.94  0.87
## [3,]    1 1.09 1.03 0.97 1.08 1.31 0.96 1.02 1.09  1.01  1.04  0.99  1.03  1.22
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.82  0.83  0.74  0.87  0.72  0.69
## [2,]  1.05  0.92  0.97  0.80  0.82  0.95
## [3,]  0.95  0.89  1.12  0.98  0.83  0.86

# For each column, the slope of each neighbor predicting it.
# For instance, X1 is predicted by its first neighbor with 
# slope 0.97 and by its second neighbor with slope 0.81 .

round(DDCx$deshrinkage,2)

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
## 1.07 1.09 1.09 1.11 1.08 1.10 1.07 1.09 1.09 1.11 1.09 1.08 1.09 1.09 1.09 1.09 
##  X17  X18  X19  X20 
## 1.11 1.08 1.09 1.11

# For each column, the factor by which its prediction is multiplied.

round(DDCx$Xest[1:12,1:10],2) # the estimated cells of remX:

##        [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
##  [1,] -0.38 -0.76 -0.44 -0.46 -0.44 -0.41 -0.87 -0.60 -0.71 -0.47
##  [2,] -0.50 -0.89 -0.58 -0.60 -0.58 -0.54 -0.98 -0.75 -0.85 -0.60
##  [3,]  0.23 -0.10  0.18  0.21  0.19  0.21 -0.24  0.04 -0.10  0.11
##  [4,]  0.61  0.31  0.54  0.63  0.58  0.58  0.15  0.40  0.29  0.52
##  [5,] -0.43 -0.78 -0.48 -0.49 -0.48 -0.43 -0.87 -0.63 -0.75 -0.54
##  [6,]  1.80  1.58  1.72  1.87  1.78  1.79  1.34  1.57  1.45  1.68
##  [7,] -0.40 -0.78 -0.48 -0.49 -0.49 -0.44 -0.88 -0.63 -0.74 -0.51
##  [8,]  0.39  0.08  0.34  0.38  0.36  0.38 -0.08  0.19  0.07  0.27
##  [9,]  0.40  0.07  0.33  0.39  0.36  0.38 -0.09  0.17  0.06  0.28
## [10,]  0.96  0.70  0.93  1.02  0.97  0.95  0.49  0.80  0.67  0.87
## [11,]  0.25 -0.08  0.18  0.22  0.20  0.22 -0.23  0.04 -0.09  0.16
## [12,] -1.55 -2.00 -1.64 -1.74 -1.67 -1.61 -2.01 -1.77 -1.90 -1.64

round(DDCx$stdResid[1:12,1:10],1)

##       [,1]  [,2] [,3] [,4] [,5]  [,6]  [,7]  [,8]  [,9] [,10]
##  [1,] -1.1  -1.4 -0.4 -0.4  0.6   0.2   0.0  -1.1   1.6   1.3
##  [2,]  0.9   1.6 -0.8  0.6 -1.4 -24.1   0.3  -2.0   0.8   0.2
##  [3,] -0.3   0.3  0.3  0.3 -0.2  -0.4   0.4   1.2    NA  -1.4
##  [4,] -0.3   0.2 -1.5   NA   NA  -0.5   0.1 -26.2   0.6   0.1
##  [5,] -1.4   0.9   NA  0.2  0.1  -0.2   3.1  -0.7   0.1  -1.3
##  [6,]  0.4   0.4 -0.4 -0.7 -0.9   0.2   2.5  -0.6   0.3   1.0
##  [7,]   NA   0.4 -0.1 -1.9 -1.8  -0.5   0.3  26.8   0.7   0.7
##  [8,] -0.1   0.2  1.2 -1.3  0.7   0.0 -30.1   0.2   0.5  -1.0
##  [9,]  0.2  -0.3 -0.6  0.8  0.2   0.4  -0.2  -0.7  -1.2  -0.1
## [10,] -1.8   0.2 -0.1 -2.8  1.0  -1.0    NA   0.8   2.7  -0.3
## [11,]   NA   1.2 -1.4 -0.6 -0.6  -0.9   0.8   1.0  -0.2   1.2
## [12,] -0.1 -25.4 -1.3 -0.1 -1.2  -1.2   0.8   2.2 -22.8   0.3

# The standardized residuals of the cells. Note the NA's and some
# large positive and negative cell residuals.

qqnorm(as.vector(DDCx$stdResid)) # Note the far outliers on both sides:

plot of chunk unnamed-chunk-6

as.vector(DDCx$indcells) # indices of the cells that were flagged by DDC:

##   [1]  27  34  35  41  62  76  79  80  90  93 100 142 146 160 164 178 180 195
##  [19] 197 219 225 245 252 267 273 281 284 292 297 299 305 308 337 342 354 357
##  [37] 367 388 396 397 410 412 430 433 439 465 482 506 529 534 548 553 561 564
##  [55] 566 585 594 597 628 631 643 648 654 663 669 675 685 696 699 704 705 715
##  [73] 728 736 754 757 758 760 784 806 816 817 831 836 843 868 881 886 887 896
##  [91] 904 905 910 921 925 942 950 956 978 981 992 996

plot(DDCx$Ti) # the Ti values of the rows. None are high.

plot of chunk unnamed-chunk-6

qqnorm(DDCx$Ti)

plot of chunk unnamed-chunk-6

DDCx$medTi # median of the raw Ti (used to standardize Ti):

## [1] 0.009748113

DDCx$madTi # median absolute deviation of the raw Ti (used to standardize Ti):

## [1] 0.05335538

DDCx$indrows # numeric(0) means no rows are flagged:

## [1] 47

as.vector(DDCx$indall) # indices of the flagged cells, including those in flagged rows:

##   [1]  27  34  35  41  47  62  76  79  80  90  93  97 100 142 146 147 160 164
##  [19] 178 180 195 197 219 225 245 247 252 267 273 281 284 292 297 299 305 308
##  [37] 337 342 347 354 357 367 388 396 397 410 412 430 433 439 447 465 482 497
##  [55] 506 529 534 547 548 553 561 564 566 585 594 597 628 631 643 647 648 654
##  [73] 663 669 675 685 696 697 699 704 705 715 728 736 747 754 757 758 760 784
##  [91] 797 806 816 817 831 836 843 847 868 881 886 887 896 897 904 905 910 921
## [109] 925 942 947 950 956 978 981 992 996 997

as.vector(DDCx$indNAs) # indices of the missing cells:

##  [1]   7  11  28  44  97 105 154 174 176 184 204 249 264 310 348 403 427 443 489
## [20] 491 517 523 550 556 589 602 606 609 615 624 645 646 658 700 725 798 803 814
## [39] 867 870 945 954 979 980 999

round(DDCx$Ximp[1:10,1:10],2)

##        [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
##  [1,] -0.81 -1.21 -0.59 -0.56 -0.23 -0.33 -0.86 -1.05 -0.15  0.01
##  [2,] -0.15 -0.38 -0.87 -0.47 -1.07 -0.54 -0.89 -1.56 -0.57 -0.53
##  [3,]  0.12 -0.01  0.31  0.28  0.13  0.04 -0.12  0.52 -0.10 -0.43
##  [4,]  0.48  0.36 -0.02  0.63  0.58  0.40  0.18  0.40  0.50  0.55
##  [5,] -0.97 -0.50 -0.48 -0.45 -0.45 -0.51 -0.87 -0.90 -0.70 -1.05
##  [6,]  1.94  1.71  1.59  1.70  1.47  1.85  2.15  1.32  1.54  2.08
##  [7,] -0.40 -0.67 -0.53 -0.94 -1.11 -0.64 -0.77 -0.63 -0.48 -0.25
##  [8,]  0.36  0.15  0.78  0.08  0.60  0.39 -0.08  0.26  0.25 -0.13
##  [9,]  0.50 -0.02  0.10  0.57  0.44  0.55 -0.14 -0.09 -0.36  0.26
## [10,]  0.28  0.77  0.87  1.02  1.32  0.55  0.49  1.12  0.67  0.76

# The imputed matrix. Both the cellwise outliers and the missing values
# are replaced by their predicted values.

round((DDCx$Ximp - DDCx$remX)[1:10,1:10],2)

##    X1 X2 X3   X4 X5   X6    X7     X8    X9 X10
## 1   0  0  0 0.00  0 0.00  0.00   0.00  0.00   0
## 2   0  0  0 0.00  0 9.46  0.00   0.00  0.00   0
## 3   0  0  0 0.00  0 0.00  0.00   0.00    NA   0
## 4   0  0  0   NA NA 0.00  0.00  10.40  0.00   0
## 5   0  0 NA 0.00  0 0.00 -1.01   0.00  0.00   0
## 6   0  0  0 0.00  0 0.00  0.00   0.00  0.00   0
## 7  NA  0  0 0.00  0 0.00  0.00 -10.63  0.00   0
## 8   0  0  0 0.00  0 0.00  9.92   0.00  0.00   0
## 9   0  0  0 0.00  0 0.00  0.00   0.00  0.00   0
## 10  0  0  0 0.65  0 0.00    NA   0.00 -0.95   0

# The nonzero values and the NA's correspond to imputed cells.

TopGear dataset

The Top Gear data contains information on 297 cars.

library(robustHD)
data(TopGear)

dim(TopGear)

## [1] 297  32

rownames(TopGear)[1:13] # "1" to "297" are not useful names

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"

rownames(TopGear) = paste(TopGear[,1],TopGear[,2]) 
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'

# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)

# Run the DDC method:
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCtransTG = DDC(transTG,DDCpars)

##  
##  The final data set we will analyze has 296 rows and 11 columns.
##

# With DDCpars = list(fastDDC = FALSE, silent = FALSE) we obtain more information:
# 
# The input data has 297 rows and 31 columns.
# 
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#   
#  [1] Maker         Model              Type               Fuel              
#  [5] DriveWheel    AdaptiveHeadlights AdjustableSteering AlarmSystem       
#  [9] Automatic     Bluetooth          ClimateControl     CruiseControl     
# [13] ElectricSeats Leather            ParkingSensors     PowerSteering     
# [17] SatNav        ESP                Origin            
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#   
# [1] Price Cylinders Displacement BHP   Torque Acceleration TopSpeed    
# [8] MPG   Weight    Length       Width Height      
# 
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#   
#   [1] Citroen C5 Tourer
# 
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#   
#   [1] Alfa Romeo Giulietta             Alfa Romeo MiTo                 
# .......
# [295] Volvo XC70                       Volvo XC90                      
# 
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
# 
# [1] Cylinders
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
# 
# [1] Price  Displacement BHP   Torque Acceleration TopSpeed MPG         
# [8] Weight Length       Width Height      
# 
# The final data set we will analyze has 296 rows and 11 columns.

remX = DDCtransTG$remX # the remaining part of the dataset
dim(remX)

## [1] 296  11

colSums(is.na(remX)) # There are still NAs, mainly in `Weight':

##        Price Displacement          BHP       Torque Acceleration     TopSpeed 
##            0            8            3            3            0            3 
##          MPG       Weight       Length        Width       Height 
##           11           32           10           15           10

# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE), 
               apply(remX,2,mad,na.rm = TRUE))
dim(standX)

## [1] 296  11

round(standX[1:5,],1) # has NAs where remX does

##                          Price Displacement  BHP Torque Acceleration TopSpeed
## Alfa Romeo Giulietta      -0.4         -0.4 -0.7    0.0          0.6     -0.5
## Alfa Romeo MiTo           -0.9         -0.7 -0.7   -1.6          0.4     -0.4
## Aston Martin Cygnet        0.3         -0.8 -0.8   -1.7          0.7     -0.9
## Aston Martin DB9           2.7          2.1  1.9    1.2         -1.2      2.0
## Aston Martin DB9 Volante   2.8          2.1  1.9    1.2         -1.2      2.0
##                           MPG Weight Length Width Height
## Alfa Romeo Giulietta      1.0   -0.3   -0.3  -0.2   -0.2
## Alfa Romeo MiTo           0.1   -1.0   -0.9  -1.1   -0.3
## Aston Martin Cygnet       0.6   -1.3   -3.1  -1.5    0.1
## Aston Martin DB9         -1.7    0.8    0.6    NA   -1.6
## Aston Martin DB9 Volante -1.7    1.0    0.6    NA   -1.6

transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # has NAs in outlying cells as well:

##                          Price Displacement BHP Torque Acceleration TopSpeed
## Alfa Romeo Giulietta      10.0          7.4 4.7    5.5         11.3      4.7
## Alfa Romeo MiTo            9.6          7.2 4.7    4.6         10.7      4.8
## Aston Martin Cygnet       10.3          7.2 4.6    4.5         11.8      4.7
## Aston Martin DB9            NA          8.7 6.2    6.1          4.6      5.2
## Aston Martin DB9 Volante    NA          8.7 6.2    6.1          4.6      5.2
##                          MPG Weight Length Width Height
## Alfa Romeo Giulietta      64   1385   4351  1798   1465
## Alfa Romeo MiTo           49   1090   4063  1720   1446
## Aston Martin Cygnet       56    988     NA  1680   1500
## Aston Martin DB9          19   1785   4720    NA   1282
## Aston Martin DB9 Volante  19   1890   4720    NA   1282

# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
tempX$Price = tempX$Price/1000 # to avoid printing long numbers
dim(tempX)

## [1] 296  11

columnlabels = colnames(tempX)
rowlabels = rownames(tempX)
# Show the following 17 cars in the cellmap:
showrows = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)

# Make two ggplot2 objects:
ggpcol = cellMap(D=tempX,
                 R=standX,
                 indcells=which(is.na(transTGcol)),
                 indrows=integer(0),
                 columnlabels=columnlabels,
                 rowlabels=rowlabels,
                 mTitle="By column",
                 showrows=showrows,
                 showVals="D",
                 adjustrowlabels=0.5) 
plot(ggpcol)

plot of chunk unnamed-chunk-9

ggpDDC = cellMap(D=tempX,
                 R=DDCtransTG$stdResid, 
                 indcells=DDCtransTG$indcells,
                 indrows=DDCtransTG$indrows,
                 columnlabels=columnlabels,
                 rowlabels=rowlabels,
                 mTitle="DetectDeviatingCells",
                 showrows=showrows,
                 showVals="D",
                 adjustrowlabels=0.5)
plot(ggpDDC)

plot of chunk unnamed-chunk-10

# Creating the pdf:
# pdf("cellMap_TopGear.pdf", width = 20, height = 15 )
# gridExtra::grid.arrange(ggpcol,ggpDDC,nrow=1) # combines 2 plots in a figure
# dev.off()

Analyzing new data by DDCpredict

We now consider the 17 cars shown in the cellmap as a new' dataset.

# Top Gear dataset: prediction of "new" data
############################################
# For comparison we first remake the cell map of the entire dataset, but now 
# showing the values of the residuals instead of the data values:

dim(remX) # 296 11

## [1] 296  11

rowlabels = rownames(remX)
columnlabels = colnames(remX)

ggpDDC = cellMap(D=remX,
                 R=DDCtransTG$stdResid,
                 indcells=DDCtransTG$indcells,
                 indrows=DDCtransTG$indrows,
                 standOD=NULL,
                 showVals="R",
                 rowlabels=rowlabels,
                 columnlabels=columnlabels,
                 mTitle="DDC",
                 showrows=showrows, 
                 adjustrowlabels=0.5, 
                 outlyingGrad = 1)
plot(ggpDDC)

plot of chunk unnamed-chunk-11

Define the “initial” dataset as the rows not in these 17:

initX = remX[-showrows,]
dim(initX) # 279 11

## [1] 279  11

# Fit initX:
DDCinitX = DDC(initX,DDCpars=DDCpars)

##  
##  The final data set we will analyze has 278 rows and 11 columns.
##

Define the “new” dataset, and apply DDCpredict to it:

newX = remX[showrows,]
dim(newX) # 17  11

## [1] 17 11

# Make predictions by DDCpredict. 
# Its inputs are:
#   Xnew       : the new data (test data)
#   InitialDDC : Must be provided.
#   DDCpars    : the input options to be used for the prediction.
#                By default the options of InitialDDC are used. 

predictDDC = DDCpredict(newX,DDCinitX)

names(DDCinitX)

##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"   "namesNotNumeric"
##  [5] "namesCaseNumber" "namesNAcol"      "namesNArow"      "namesDiscrete"  
##  [9] "namesZeroScale"  "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"           "robcors"        
## [17] "robslopes"       "deshrinkage"     "Xest"            "scalestres"     
## [21] "stdResid"        "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"          "indNAs"         
## [29] "Ximp"

# For comparison with:

names(predictDDC) # Fewer, since DDCpredict does not call checkDataSet:

##  [1] "DDCpars"     "locX"        "scaleX"      "Z"           "nbngbrs"    
##  [6] "ngbrs"       "robcors"     "robslopes"   "deshrinkage" "Xest"       
## [11] "scalestres"  "stdResid"    "indcells"    "Ti"          "medTi"      
## [16] "madTi"       "indrows"     "indNAs"      "indall"      "Ximp"

# If you specify the parameters the result is the same:
predictDDC2 = DDCpredict(newX,DDCinitX,DDCpars=DDCpars)
all.equal(predictDDC,predictDDC2) # TRUE

## [1] TRUE

ggpnew = cellMap(D=newX,
                 R=predictDDC$stdResid,
                 indcells=predictDDC$indcells,
                 indrows=predictDDC$indrows,
                 standOD=NULL,
                 showVals="R",
                 rowlabels=rowlabels[showrows],
                 columnlabels=columnlabels,
                 mTitle="DDCpredict",
                 adjustrowlabels=0.5, 
                 outlyingGrad = 1)
plot(ggpnew) # Looks quite similar to the result using the entire dataset:

plot of chunk unnamed-chunk-14

# Creating the pdf:
# pdf("TopGear_DDCpredict.pdf",width=20,height=15)
# gridExtra::grid.arrange(ggpDDC,ggpnew,nrow=1) 
# dev.off()

Philips data

The philips data contains 9 measurements of TV parts from a production line.

data(data_philips)
dim(data_philips)

## [1] 677   9

colnames(data_philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")

DDCphilips = DDC(data_philips,DDCpars)

qqnorm(as.vector(DDCphilips$Z)) # rather gaussian, here we only see 2 outliers:

plot of chunk unnamed-chunk-15

round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
##  [1,] -0.9  2.2  0.0  1.2  0.8 -2.9  2.1  0.1 -1.4
##  [2,] -0.5  2.0  0.0  3.9  0.8 -1.1  3.5 -0.4 -1.7
##  [3,] -1.0  2.3 -0.4  0.7  0.6 -2.0  1.4  0.0 -1.7
##  [4,] -0.6  2.3 -0.1  3.8  1.1 -0.9  3.4 -0.5 -2.0
##  [5,] -0.6  2.8 -0.2  3.4  0.9 -0.9  3.4  0.2 -1.9
##  [6,] -0.7  2.6  0.0  4.4  1.0 -1.1  4.2 -0.1 -2.0
##  [7,]  1.0 -0.1 -1.0 -0.3 -1.2 -1.0  1.4  0.8 -0.6
##  [8,] -0.4 -1.4 -0.7  1.8 -0.7 -2.8  1.9 -0.3 -1.7
##  [9,]  1.0 -0.2  0.0 -0.1 -1.2 -2.1  1.6  1.0 -0.9
## [10,]  0.2 -1.3 -0.1  1.0 -0.5 -2.9  2.0 -0.1 -1.5
## [11,] -0.1 -0.3 -0.1  0.2 -1.1 -1.5  1.9  0.8 -0.8
## [12,] -0.1 -0.4 -1.5  2.5  0.0 -3.4  2.3  0.3 -2.5

DDCphilips$indcells # indices of the cells that were flagged:

##   [1]   55   70   95  104  116  144  151  156  161  170  174  175  324  468  682
##  [16]  683  712  747  750  792  985  986  987  988  989  991  992 1006 1007 1008
##  [31] 1010 1011 1196 1199 1450 1452 1455 1474 1529 1787 2033 2035 2036 2037 2045
##  [46] 2047 2094 2101 2114 2116 2126 2135 2267 2312 2355 2365 2464 2499 2592 2743
##  [61] 2775 2777 3120 3216 3226 3227 3228 3248 3386 3393 3395 3397 3399 3401 3412
##  [76] 3413 3414 3415 3416 3417 3423 3424 3425 3426 3427 3428 3430 3431 3437 3445
##  [91] 3456 3457 3460 3464 3465 3466 3468 3469 3470 3990 4064 4066 4067 4068 4076
## [106] 4078 4088 4145 4147 4792 4794 5036 5037 5168 5169 5170 5171 5172 5174 5175
## [121] 5176 5230 5238 5241 5246 5248 5250 5253 5256 5258 5259 5260 5262 5263 5265
## [136] 5267 5268 5270 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5569
## [151] 5869 5873 5882 5890 5902

DDCphilips$indrows # flagged rows:

## [1]  26  56  84 334 491

We also apply the rowwise method MCD to detect outlying rows:

library(robustbase) # for covMcd

MCDphilips = robustbase::covMcd(data_philips)
indrowsMCD = which(mahalanobis(data_philips,MCDphilips$center,
                               MCDphilips$cov) > qchisq(0.975,df=d))

plot(sqrt(mahalanobis(data_philips,MCDphilips$center,MCDphilips$cov)),
     main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9))) # this horizontal line is the cutoff.

plot of chunk unnamed-chunk-17

# dev.copy(pdf,"Figure_philips_left.pdf",width=10,height=4)
# dev.off()

# cellMaps with rectangular blocks:

n = nrow(data_philips)
nrowsinblock = 15
rowlabels = 1:n

d = ncol(data_philips)
ncolumnsinblock = 1
columnlabels = colnames(data_philips)

ggpMCDphilips = cellMap(D=data_philips,
                        R=matrix(0,n,d),
                        indcells=integer(0),
                        indrows=indrowsMCD,
                        rowlabels=rowlabels,
                        columnlabels=columnlabels,                        
                        mTitle="MCD",
                        nrowsinblock=nrowsinblock,
                        ncolumnsinblock=ncolumnsinblock,                        
                        autolabel=T)
plot(ggpMCDphilips)

plot of chunk unnamed-chunk-18

ggpDDCphilips = cellMap(D=data_philips, 
                        R=DDCphilips$stdResid,
                        indcells=DDCphilips$indcells,
                        indrows=DDCphilips$indrows,
                        rowlabels=rowlabels,
                        columnlabels=columnlabels,                        
                        mTitle="DetectDeviatingCells",
                        nrowsinblock=nrowsinblock,
                        ncolumnsinblock=ncolumnsinblock,
                        autolabel=T,
                        adjustrowlabels=1)
plot(ggpDDCphilips)

plot of chunk unnamed-chunk-18

# dev.copy(pdf,"Figure_philips_right.pdf",width=6,height=12)
# dev.off()

Mortality dataset

This dataset contains the mortality of French males in the years 1816 to 2013.

data(data_mortality)
dim(data_mortality)

## [1] 198  91

# 198  91
rownames(data_mortality)[1:5]

## [1] "1816" "1817" "1818" "1819" "1820"

colnames(data_mortality)[1:5]

## [1] "0" "1" "2" "3" "4"

DDCmortality = DDC(data_mortality,DDCpars) # 1 second

remX = DDCmortality$remX
dim(remX)

## [1] 198  91

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA

PCAmortality = rrcov::PcaHubert(data_mortality,alpha=0.75,scale=FALSE)

n = nrow(remX)
nrowsinblock = 5
rowlabels = rownames(remX)

d = ncol(remX)
ncolumnsinblock = 5
columnlabels = colnames(remX)

ggpROBPCA = cellMap(D=remX,
                    R=matrix(0,n,d),
                    indrows=which(PCAmortality@flag==FALSE),
                    rowlabels=rowlabels,
                    columnlabels=columnlabels,                    
                    mTitle="By row",
                    nrowsinblock=nrowsinblock,
                    ncolumnsinblock=ncolumnsinblock, 
                    rowtitle = "Years",
                    columntitle = "Age",
                    sizetitles = 2.0,
                    autolabel=T)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-21

ggpDDC = cellMap(D=remX, 
                 R=DDCmortality$stdResid,
                 indcells=DDCmortality$indcells,
                 indrows=DDCmortality$indrows,
                 rowlabels=rowlabels,
                 columnlabels=columnlabels,                 
                 mTitle="DetectDeviatingCells",
                 nrowsinblock=nrowsinblock,
                 ncolumnsinblock=ncolumnsinblock,
                 rowtitle = "Years",
                 columntitle = "Age",
                 sizetitles = 2.0,
                 autolabel=T)
plot(ggpDDC) # Leads to a detailed interpretation:

plot of chunk unnamed-chunk-21

# pdf("cellmap_mortality.pdf",width=14,height=12)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
# dev.off()

Glass dataset

The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.

data(data_glass)
DDCglass = DDC(data_glass,DDCpars) # takes 8 seconds

##  
##  The final data set we will analyze has 180 rows and 737 columns.
##

remX = DDCglass$remX
# With DDCpars$silent = FALSE we obtain more information:
#
#  The input data has 180 rows and 750 columns.
#
#  The data contained 11 discrete columns with 3 or fewer values.
#  Their column names are:
#
#  [1] V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 739 columns:
#  
#   [1] V12  V13  V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25
#    ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The data contained 2 columns with zero or tiny median absolute deviation.
#  Their column names are:
#
# [1] V12 V13
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 737 columns:
#
#   [1] V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27
#    ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The final data set we will analyze has 180 rows and 737 columns.

dim(remX)

## [1] 180 737

We will compare this with the faster approximate algorithm of DDC, obtained by the option fastDDC=TRUE:

fastDDCpars = list(fastDDC = TRUE, silent = TRUE)
fastDDCglass = DDC(data_glass, fastDDCpars) # takes 2 seconds

##  
##  The final data set we will analyze has 180 rows and 737 columns.
##

remXfast = fastDDCglass$remX
all.equal(remX,remXfast) # The remaining data is the same:

## [1] TRUE

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA

PCAglass = rrcov::PcaHubert(remX,alpha=0.75,scale=FALSE)

n = nrow(remX)
nrowsinblock = 5
rowtitle = "glass samples"
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/ncolumnsinblock)] = "n";
# rowlabels

d = ncol(remX)
ncolumnsinblock = 5
columntitle = "wavelengths"
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/nrowsinblock)] = "d"
# columnlabels

ggpROBPCA = cellMap(D=remX, 
                    R=matrix(0,n,d),
                    indcells=integer(0),
                    indrows=which(PCAglass@flag==FALSE),
                    rowlabels=rowlabels,
                    columnlabels=columnlabels,
                    mTitle="By row",
                    nrowsinblock=nrowsinblock,
                    ncolumnsinblock=ncolumnsinblock,
                    columnangle=0,
                    rowtitle=rowtitle,
                    columntitle=columntitle,
                    sizetitles=1.5,
                    autolabel=F)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-25

ggpDDC = cellMap(D=remX, 
                 R=DDCglass$stdResid,
                 indcells=DDCglass$indcells,
                 indrows=DDCglass$indrows,
                 rowlabels=rowlabels,
                 columnlabels=columnlabels,
                 mTitle="DDC",
                 nrowsinblock=nrowsinblock,
                 ncolumnsinblock=ncolumnsinblock,
                 columnangle=0,
                 rowtitle=rowtitle,
                 columntitle=columntitle,
                 sizetitles=1.5,
                 autolabel=F)
plot(ggpDDC)

plot of chunk unnamed-chunk-25

# pdf("cellmap_glass_ROBPCA_DDC.pdf",width=16,height=10)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
# dev.off()

ggpfastDDC = cellMap(D=remXfast, 
                     R=fastDDCglass$stdResid,
                     indcells=fastDDCglass$indcells,
                     indrows=fastDDCglass$indrows,
                     rowlabels=rowlabels,
                     columnlabels=columnlabels,
                     mTitle="fast DDC",
                     nrowsinblock=nrowsinblock,
                     ncolumnsinblock=ncolumnsinblock,
                     columnangle=0,
                     rowtitle=rowtitle,
                     columntitle=columntitle,    
                     sizetitles=1.5,
                     autolabel=F)
plot(ggpfastDDC)

plot of chunk unnamed-chunk-25

# pdf("cellmap_glass_DDC_fastDDC.pdf",width=16,height=10)
# gridExtra::grid.arrange(ggpDDC,ggpfastDDC,ncol=1)
# dev.off()