User Tools

Site Tools


k-means_clustering_using_r

Load libraries

library(caret)
library(ykmeans)
library(ggplot2)
library(plyr)
library(fmsb)

Load data

# Read the raw observations from clusters.csv in the current working directory.
sample <- utils::read.csv(file = "./clusters.csv")

Drop the first column, which contains text, then check for near-zero-variance columns

# Drop the first column. drop = FALSE keeps a data frame even when only a
# single column is left.
sample <- sample[, -1, drop = FALSE]

# Column positions flagged by caret::nearZeroVar().
nzv <- nearZeroVar(sample)

# Remove the flagged columns only when some were found: indexing with
# -integer(0) selects no columns at all and would silently empty the data.
if (length(nzv) > 0) {
  sample <- sample[, -nzv, drop = FALSE]
}

# Print the flagged positions for inspection.
nzv

Create a data matrix

# Convert the data frame to a numeric matrix.
sample1 <- data.matrix(frame = sample)

# Pairwise correlation matrix of the columns.
samplecor <- stats::cor(x = sample1)

# Column positions caret suggests removing at a correlation cutoff of 0.7.
samplehighcor <- findCorrelation(x = samplecor, cutoff = 0.7)

Remove highly correlated columns (pairwise correlation above 0.7)

# Drop the columns flagged as highly correlated. When nothing was flagged the
# index is integer(0), and sample1[, -integer(0)] would select zero columns,
# so the matrix is kept unchanged in that case. drop = FALSE keeps a matrix
# even if only one column survives.
samplenocor <- if (length(samplehighcor) > 0) {
  sample1[, -samplehighcor, drop = FALSE]
} else {
  sample1
}

Decorrelate the remaining variables with PCA (on scaled data)

# Principal component analysis on the remaining columns, scaled to unit
# variance. The argument is spelled out as `scale.` (the original relied on
# partial matching of `scale`) and TRUE is used instead of the reassignable T.
samplenocorbase <- prcomp(samplenocor, scale. = TRUE)

Clustering, from 3 to 6 clusters

# Principal component scores as a data frame (one column per component).
samplepca <- data.frame(samplenocorbase[["x"]])

# Every component is used as a clustering variable.
keys <- colnames(samplepca)

# Run ykmeans with PC1 as the target, trying 3, 4, 5 and 6 clusters.
samplekm <- ykmeans(samplepca, keys, "PC1", c(3L, 4L, 5L, 6L))

Check the cluster sizes to infer the number of clusters

# Count how many observations fall into each cluster.
table(samplekm[["cluster"]])

Plot the clusters!

# Re-run the clustering with the number of clusters fixed at 6.
samplekm <- ykmeans(samplepca, keys, "PC1", 6)

# Scatter plot of the first two principal components; colour and point shape
# both encode the cluster.
ggplot(
  samplekm,
  aes(
    x = PC1,
    y = PC2,
    colour = as.factor(cluster),
    shape = as.factor(cluster)
  )
) +
  geom_point()

Add the cluster column

# Attach the cluster labels to the original data.
# NOTE(review): assumes ykmeans returns rows in the same order as its input - confirm.
sample[["cluster"]] <- samplekm[["cluster"]]

Basically, pivot on cluster using the average. (I could have melted and cast the data.)

# Mean of every column within each cluster (one row per cluster).
samplecenter <- aggregate(
  x = sample,
  by = list(sample[["cluster"]]),
  FUN = mean
)

# Drop the averaged cluster label and the grouping column aggregate() added,
# leaving only the per-cluster feature means.
samplecenter[c("cluster", "Group.1")] <- NULL

Add a radar chart, but create a base data.frame beforehand.

#' Build a data frame for a radar chart.
#'
#' Prepends two rows to the data: row 1 holds each column's maximum plus
#' `margin`, row 2 each column's minimum minus `margin`. The original rows
#' follow unchanged.
#'
#' @param df A data frame, or anything `data.frame()` can convert (for example
#'   a numeric matrix). All columns must be numeric.
#' @param margin Padding added above the maximum and below the minimum of
#'   every column. Defaults to 1, the value that used to be hard-coded.
#' @return A data frame with `nrow(df) + 2` rows and the same columns as `df`.
createRaderChartDataFrame <- function(df, margin = 1) {
  df <- data.frame(df)
  # vapply() works column by column, so the data frame is not coerced to a
  # matrix the way apply() would, and each result is checked to be one number.
  dfmax <- vapply(df, max, numeric(1)) + margin
  dfmin <- vapply(df, min, numeric(1)) - margin
  as.data.frame(rbind(dfmax, dfmin, df))
}
 
# Standardise the per-cluster means column by column, then add the max/min
# rows needed for the radar chart.
sampleradar <- createRaderChartDataFrame(
  df = scale(x = samplecenter)
)

Plot the radarchart.

##par(family="HiraKakuProN-W3")
# Draw one polygon per cluster and a matching legend. local() keeps the helper
# variables out of the global environment.
local({
  # The first two rows of sampleradar are the max/min rows; every remaining
  # row is one cluster.
  n_clusters <- nrow(sampleradar) - 2

  # One distinct colour per cluster. The original used rainbow(5) for six
  # clusters, so the sixth cluster recycled the first cluster's colour.
  cluster_cols <- rainbow(n_clusters)

  radarchart(sampleradar, seg = 5, plty = 4, plwd = 4, pcol = cluster_cols)

  # The legend uses the same colours and the same line type (4) as the chart;
  # the original lty = 1:7 matched neither the plot nor the number of entries.
  legend(
    "topright",
    legend = seq_len(n_clusters),
    col = cluster_cols,
    lty = 4
  )
})

k-means_clustering_using_r.txt · Last modified: 2015/07/24 15:25 by vincenzo