basic usage

Aviezer Lifshitz

2022-04-19

Basic usage of the package.

Basic usage

First, let’s create 5 clusters normally distributed around 1 to 5, with sd of 0.3:

data <- simulate_data(n=100, sd=0.3, nclust=5, dims=2)
data
## # A tibble: 500 × 4
##       id    V1    V2 true_clust
##    <int> <dbl> <dbl>      <int>
##  1     1 0.812 1.46           1
##  2     2 1.02  1.22           1
##  3     3 1.39  1.31           1
##  4     4 1.42  0.891          1
##  5     5 0.914 1.48           1
##  6     6 0.263 0.942          1
##  7     7 0.613 1.04           1
##  8     8 1.27  1.05           1
##  9     9 0.743 0.758          1
## 10    10 1.14  0.887          1
## # … with 490 more rows

This is what our data looks like:

data %>% ggplot(aes(x=V1, y=V2, color=factor(true_clust))) + 
    geom_point() + 
    scale_color_discrete(name='true cluster')

Now we can cluster it using kmeans++:

data_for_clust <- data %>% select(id, starts_with('V'))
km <- TGL_kmeans_tidy(data_for_clust,
              k=5, 
              metric='euclid', 
              verbose=TRUE)
## id column: id
## KMEans: will generate seeds
## KMeans into generate seeds
## at seed 0
## add new core from 1 to 0
## at seed 1
## done update min distance
## seed range 350 450
## picked up 473 dist was 2.90567
## add new core from 473 to 1
## at seed 2
## done update min distance
## seed range 300 400
## picked up 273 dist was 1.38198
## add new core from 273 to 2
## at seed 3
## done update min distance
## seed range 250 350
## picked up 184 dist was 0.665351
## add new core from 184 to 3
## at seed 4
## done update min distance
## seed range 200 300
## picked up 319 dist was 0.63195
## add new core from 319 to 4
## KMEans: reassign after init
## KMEans: iter 0
## KMEans: iter 1 changed 0

The returned list contains 3 fields:

names(km)
## [1] "centers" "cluster" "size"

km$centers contains a tibble with clust column and the cluster centers:

km$centers
## # A tibble: 5 × 3
##   clust    V1    V2
##   <int> <dbl> <dbl>
## 1     1  1.04  1.05
## 2     2  2.00  2.04
## 3     3  4.00  3.87
## 4     4  5.07  4.97
## 5     5  2.99  2.98

Clusters are numbered according to reorder_func (see ‘Custom cluster ordering’ section).

km$cluster contains a tibble with an id column with the observation id (1:n if no id column was supplied), and a clust column with the cluster assigned to each observation:

km$cluster
## # A tibble: 500 × 2
##    id    clust
##    <chr> <int>
##  1 1         1
##  2 2         1
##  3 3         1
##  4 4         1
##  5 5         1
##  6 6         1
##  7 7         1
##  8 8         1
##  9 9         1
## 10 10        1
## # … with 490 more rows

km$size contains a tibble with a clust column and an n column with the number of points in each cluster:

km$size
## # A tibble: 5 × 2
##   clust     n
##   <int> <int>
## 1     1   100
## 2     2   100
## 3     3    99
## 4     4   104
## 5     5    97

We can now check our clustering performance - fraction of observations that were classified correctly (Note that match_clusters function is internal to the package and is used only in this vignette):

d <- tglkmeans:::match_clusters(data, km, 5)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 0.978

And plot the results:

d %>% ggplot(aes(x=V1, y=V2, color=factor(new_clust), shape=factor(true_clust))) + 
    geom_point() + 
    scale_color_discrete(name='cluster') + 
    scale_shape_discrete(name='true cluster') + 
    geom_point(data=km$centers, size=7, color='black', shape='X')

Custom cluster ordering

By default, the clusters are ordered using the following function: hclust(dist(cor(t(centers)))) - hclust of the euclidean distance of the correlation matrix of the centers.

We can supply our own function to order the clusters using the reorder_func argument. The function would be applied to each center, and the clusters would be ordered by the result.

km <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
              k=5, 
              metric='euclid', 
              verbose=FALSE, 
              reorder_func=median)
km$centers
## # A tibble: 5 × 3
##   clust    V1    V2
##   <int> <dbl> <dbl>
## 1     1  1.04 0.999
## 2     2  2.02 1.97 
## 3     3  2.99 2.97 
## 4     4  3.98 3.93 
## 5     5  5.00 4.99

Missing data

tglkmeans can deal with missing data, as long as at least one dimension is not missing. For example:

data$V1[sample(1:nrow(data), round(nrow(data)*0.2))] <- NA
data
## # A tibble: 500 × 4
##       id    V1    V2 true_clust
##    <int> <dbl> <dbl>      <int>
##  1     1 0.812 1.46           1
##  2     2 1.02  1.22           1
##  3     3 1.39  1.31           1
##  4     4 1.42  0.891          1
##  5     5 0.914 1.48           1
##  6     6 0.263 0.942          1
##  7     7 0.613 1.04           1
##  8     8 1.27  1.05           1
##  9     9 0.743 0.758          1
## 10    10 1.14  0.887          1
## # … with 490 more rows
km <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
              k=5, 
              metric='euclid', 
              verbose=FALSE)
d <- tglkmeans:::match_clusters(data, km, 5)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 0.97

and plotting the results (without the NA’s) we get:

d %>% ggplot(aes(x=V1, y=V2, color=factor(new_clust), shape=factor(true_clust))) + 
    geom_point() + 
    scale_color_discrete(name='cluster') + 
    scale_shape_discrete(name='true cluster') + 
    geom_point(data=km$centers, size=7, color='black', shape='X')
## Warning: Removed 100 rows containing missing values (geom_point).

High dimensions

Let’s move to higher dimensions (and higher noise):

data <- simulate_data(n=100, sd=0.3, nclust=30, dims=300)
km <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
    k=30, 
    metric='euclid', 
    verbose=FALSE)
d <- tglkmeans:::match_clusters(data, km, 30)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 1

Comparison with R vanilla kmeans

Let’s compare it to R vanilla kmeans:

km_standard <- kmeans(data %>% select(starts_with('V')), 30)
km_standard$clust <- tibble(id = 1:nrow(data), clust=km_standard$cluster)

d <- tglkmeans:::match_clusters(data, km_standard, 30)
sum(d$true_clust == d$new_clust, na.rm=TRUE) / sum(!is.na(d$new_clust))
## [1] 0.7142857

We can see that kmeans++ clusters significantly better than R vanilla kmeans.

Random seed

We can set the seed for the C++ random number generator, for reproducible results:

km1 <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
               k=30, 
               metric='euclid', 
               verbose=FALSE, 
               seed = 60427)
km2 <- TGL_kmeans_tidy(data %>% select(id, starts_with('V')), 
               k=30, 
               metric='euclid', 
               verbose=FALSE, 
               seed = 60427)
all(km1$centers[, -1] == km2$centers[, -1])
## [1] TRUE