Estimate best K for K-Means in parallel

Gap statistic is often used to determine the best number of clusters. Please see a local version implementation for gap statistic here: https://github.com/echen/gap-statistic.

It is often desired to parallelize such tedious job to boost the speed. I implement a parallelized version basd on the source code:

library(plyr)
library(ggplot2)

# Calculate log(sum_i(within-cluster_i sum of squares around cluster_i mean)).
dispersion = function(data, num_clusters) {
  # R's k-means algorithm doesn't work when there is only one cluster.
  if (num_clusters == 1) {
    cluster_mean = aaply(data, 2, mean)
    distances_from_mean = aaply((data - cluster_mean)^2, 1, sum)
    log(sum(distances_from_mean))
  } else {	
    # Run the k-means algorithm `nstart` times. Each run uses at most `iter.max` iterations.
    k = kmeans(data, centers = num_clusters, nstart = 5, iter.max = 50)
    # Take the sum, over each cluster, of the within-cluster sum of squares around the cluster mean. Then take the log. This is `W_k` in TWH's notation.
    log(sum(k$withinss))
  }
}

# For an appropriate reference distribution (in this case, uniform points in the same range as `data`), simulate the mean and standard deviation of the dispersion.
reference_dispersion = function(data, num_clusters, num_reference_bootstraps) {
  dispersions = maply(1:num_reference_bootstraps, function(i) dispersion(generate_uniform_points(data), num_clusters))
  mean_dispersion = mean(dispersions)
  stddev_dispersion = sd(dispersions) / sqrt(1 + 1 / num_reference_bootstraps) # the extra factor accounts for simulation error
  c(mean_dispersion, stddev_dispersion)
}


# Given a matrix `data`, where rows are observations and columns are individual dimensions, compute and plot the gap statistic (according to a uniform reference distribution).
gap_statistic = function(data, min_num_clusters = 1, max_num_clusters = 10, num_reference_bootstraps = 10) {
  # register parallelism backend
  library(doMC)
  registerDoMC(4)
  
  num_clusters = min_num_clusters:max_num_clusters
  actual_dispersions = maply(num_clusters, function(n) dispersion(data, n), .parallel = TRUE, 
                             .paropts = list(.export=c("dispersion"),
                                             .packages=c("plyr")))
  ref_dispersions = maply(num_clusters, function(n) reference_dispersion(data, n, num_reference_bootstraps), 
                          .parallel = TRUE,
                          .paropts = list(.export=c("dispersion", "reference_dispersion"),
                                         .packages=c("plyr")))
  mean_ref_dispersions = ref_dispersions[ , 1]
  stddev_ref_dispersions = ref_dispersions[ , 2]
  gaps = mean_ref_dispersions - actual_dispersions
  
  print(plot_gap_statistic(gaps, stddev_ref_dispersions, num_clusters))
  
  print(paste("The estimated number of clusters is ", num_clusters[which.max(gaps)], ".", sep = ""))
  
  list(gaps = gaps, gap_stddevs = stddev_ref_dispersions)
}

# Plot the gaps along with error bars.
plot_gap_statistic = function(gaps, stddevs, num_clusters) {
  qplot(num_clusters, gaps, xlab = "# clusters", ylab = "gap", geom = "line", main = "Estimating the number of clusters via the gap statistic") + geom_errorbar(aes(num_clusters, ymin = gaps - stddevs, ymax = gaps + stddevs), size = 0.3, width = 0.2, colour = "darkblue")
}

# Generate uniform points within the range of `data`.
generate_uniform_points = function(data) {
  # Find the min/max values in each dimension, so that we can generate uniform numbers in these ranges.
  mins = aaply(data, 2, min)
  maxs = apply(data, 2, max)
  
  num_datapoints = nrow(data)
  # For each dimension, generate `num_datapoints` points uniformly in the min/max range.
  uniform_pts = maply(1:length(mins), function(dim) runif(num_datapoints, min = mins[dim], max = maxs[dim]))
  uniform_pts = t(uniform_pts)
}

 

It uses `doMC` as parallelism backend. You can change how many cores it uses in line 32.

 

Leave a comment

Your email address will not be published. Required fields are marked *