Introduction

This vignette visualizes classification results from a random forest, using tools from the classmap package.

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(classmap)

Instagram training data

We use the Instagram data to illustrate the visualization of a random forest classification. The data concern the identification of genuine versus fake (spam) accounts on Instagram. The original data source is https://www.kaggle.com/free4ever1/instagram-fake-spammer-genuine-accounts, provided by Bardiya Bakhshandeh.

First we load and inspect the data.

data("data_instagram")
traindata <- data_instagram[which(data_instagram$dataType == "train"), -13]
str(traindata)

## 'data.frame':    576 obs. of  12 variables:
##  $ profile.pic         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ nums.length.username: num  0.27 0 0.1 0 0 0 0 0 0 0 ...
##  $ fullname.words      : int  0 2 2 1 2 4 2 2 0 2 ...
##  $ nums.length.fullname: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ name..username      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ description.length  : int  53 44 0 82 0 81 50 0 71 40 ...
##  $ external.URL        : int  0 0 0 0 0 1 0 0 0 1 ...
##  $ private             : int  0 0 1 0 1 0 0 0 0 0 ...
##  $ X.posts             : int  32 286 13 679 6 344 16 33 72 213 ...
##  $ X.followers         : int  1000 2740 159 414 151 669987 122 1078 1824 12945 ...
##  $ X.follows           : int  955 533 98 651 126 150 177 76 2713 813 ...
##  $ y                   : Factor w/ 2 levels "genuine","fake": 1 1 1 1 1 1 1 1 1 1 ...

# The variable names and their interpretation are
colnames(traindata)

##  [1] "profile.pic"          "nums.length.username" "fullname.words"      
##  [4] "nums.length.fullname" "name..username"       "description.length"  
##  [7] "external.URL"         "private"              "X.posts"             
## [10] "X.followers"          "X.follows"            "y"

# profile.pic: binary, indicates whether profile has picture
# nums.length.username: ratio of number of numerical chars in username to its length
# fullname.words: number of words in full name
# nums.length.fullname: ratio of number of numerical characters in full name to its length
# name..username: binary, indicates whether name == username of the profile
# description.length: length of the description/biography of the profile (in number of characters)
# external.URL: binary, indicates whether profile has external url
# private: binary, indicates whether profile is private or not
# X.posts: number of posts made by profile
# X.followers: number of followers
# X.follows: number of accounts the profile follows
# y: whether profile is fake or not.

x_train <- traindata[, -12]
y_train <- traindata[, 12]

dim(traindata)

## [1] 576  12

table(traindata$y) # 50/50 split of genuine/fake accounts:

## 
## genuine    fake 
##     288     288

Now we train a random forest. We set the seed first because the random forest algorithm is randomized, so its results are only reproducible with a fixed seed.

set.seed(71) 
rfout <- randomForest(y ~ ., data = traindata, keep.forest = TRUE)

Now we create a list called mytype which describes the types of the variables in the data. The variables that are not listed will be interval-scaled by default. The Instagram data contains mostly numeric variables and 4 symmetric binary variables (profile.pic, name..username, external.URL and private, i.e. columns 1, 5, 7 and 8).

mytype <- list(symm = c(1, 5, 7, 8))

Now we prepare for the visualization of the random forest classification.

vcrtrain <- vcr.forest.train(X = x_train, y = y_train,
                            trainfit = rfout, type = mytype)

names(vcrtrain)

##  [1] "X"         "yint"      "y"         "levels"    "predint"   "pred"     
##  [7] "altint"    "altlab"    "PAC"       "figparams" "fig"       "farness"  
## [13] "ofarness"  "trainfit"

vcrtrain$predint[c(1:10, 301:310)] # prediction as integer

##   1   2   3   4   5   6   7   8   9  10 301 302 303 304 305 306 307 308 309 310 
##   1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2

vcrtrain$pred[c(1:10, 301:310)]    # prediction as label

##  [1] "genuine" "genuine" "genuine" "genuine" "genuine" "genuine" "genuine"
##  [8] "genuine" "genuine" "genuine" "fake"    "fake"    "fake"    "fake"   
## [15] "fake"    "fake"    "fake"    "fake"    "fake"    "fake"

vcrtrain$altint[c(1:10, 301:310)]  # alternative label as integer

##  [1] 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1

vcrtrain$altlab[c(1:10, 301:310)]  # alternative label

##  [1] "fake"    "fake"    "fake"    "fake"    "fake"    "fake"    "fake"   
##  [8] "fake"    "fake"    "fake"    "genuine" "genuine" "genuine" "genuine"
## [15] "genuine" "genuine" "genuine" "genuine" "genuine" "genuine"

# Probability of Alternative Class (PAC) of each object:
vcrtrain$PAC[1:3]

## [1] 0.124 0.000 0.036

#
summary(vcrtrain$PAC)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.01200 0.04935 0.05600 0.48400

# f(i, g) is the distance from case i to class g:
vcrtrain$fig[1:3, ] # for the first 3 objects:

##           [,1]      [,2]
## [1,] 0.8896095 0.9053458
## [2,] 0.2818389 0.9392232
## [3,] 0.6013421 0.5258344

# The farness of an object i is f(i, g) for its own class g:
vcrtrain$farness[1:3]

## [1] 0.8896095 0.2818389 0.6013421

#
summary(vcrtrain$farness)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02329 0.24326 0.50000 0.49483 0.74443 0.99996

# The "overall farness" of an object is defined as the 
# lowest f(i, g) it has to any class g (including its own):
summary(vcrtrain$ofarness)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02329 0.22795 0.46616 0.47415 0.70684 0.99996

sum(vcrtrain$ofarness > 0.99, na.rm = TRUE)

## [1] 6

# With the default cutoff = 0.99 we find 6 outliers,
# also shown in the last column of the confusion matrix:

confmat.vcr(vcrtrain)

## 
## Confusion matrix:
##          predicted
## given     genuine fake outl
##   genuine     283    0    5
##   fake          0  287    1
## 
## The accuracy is 100%.

# If we do not want to show the outliers:
confmat.vcr(vcrtrain, showOutliers = FALSE)

## 
## Confusion matrix:
##          predicted
## given     genuine fake
##   genuine     288    0
##   fake          0  288
## 
## The accuracy is 100%.

# Note that the accuracy is computed before any objects
# are flagged, so it does not depend on the cutoff.
# Here the accuracy is "perfect" due to overfitting.
# The out-of-bag (OOB) prediction accuracy is about 92%.
cols <- c("blue", "red3")

Now we can use the visualization tools from this package.

stackedplot(vcrtrain, classCols = cols, main =
              "Instagram training data")

plot of chunk unnamed-chunk-7

# Silhouette plot:
silplot(vcrtrain, classCols = cols)

##  classNumber classLabel classSize classAveSi
##            1    genuine       288        0.9
##            2       fake       288        0.9

plot of chunk unnamed-chunk-7

# Here all the s(i) are nonnegative (due to overfitting).

# Class maps:
classmap(vcrtrain, "genuine", classCols = cols) #, identify = TRUE)

plot of chunk unnamed-chunk-7

# farness outliers, from farthest to closest: 45, 25, 41
x_train[c(45, 25, 41), ] # they have huge numbers of followers.

##    profile.pic nums.length.username fullname.words nums.length.fullname
## 45           1                    0              4                    0
## 25           1                    0              0                    0
## 41           1                    0              0                    0
##    name..username description.length external.URL private X.posts X.followers
## 45              0                 35            0       0    4494    12397719
## 25              0                  0            0       0     148    15338538
## 41              0                  2            0       0    7389      890969
##    X.follows
## 45         8
## 25        61
## 41        11

classmap(vcrtrain, "fake", classCols = cols) #, identify = TRUE)

plot of chunk unnamed-chunk-7

# only case 261 is borderline far.

The classification of the training data is not very realistic due to overfitting, so let us look at the test data.

Instagram test data

Now we consider the test data. First we load the data.

testdata <- data_instagram[which(data_instagram$dataType == "test"), -13]
Xnew <- testdata[, -12]
ynew <- testdata[, 12]

We can now prepare for visualization:

vcrtest <- vcr.forest.newdata(Xnew, ynew, vcrtrain)

confmat.vcr(vcrtest)

## 
## Confusion matrix:
##          predicted
## given     genuine fake outl
##   genuine      54    5    1
##   fake          5   54    1
## 
## The accuracy is 91.67%.

First we visualize using the stacked plot and the silhouette plot:

stackedplot(vcrtest, classCols = cols, 
            main = "RF on Instagram test data")

plot of chunk unnamed-chunk-10

# Silhouette plot:
silplot(vcrtest, classCols = cols, main =
          "Silhouettes of RF on Instagram test data") # now some s(i) are negative

##  classNumber classLabel classSize classAveSi
##            1    genuine        60       0.75
##            2       fake        60       0.75

plot of chunk unnamed-chunk-10

Now we make the class maps.

## Class of genuine accounts:

classmap(vcrtest, "genuine", classCols = cols) #, identify = TRUE)