UnsupLearnR

The goal of UnsupLearnR is to assist with unsupervised learning in R - particularly with visualisation.

Installation

You can install the development version of UnsupLearnR like so:

# install pacman
if (!require("pacman")) install.packages("pacman")
# install (and load) dependencies from CRAN
pacman::p_load(devtools, ggplot2, ggpubr)
# install graphicsPLr from GitHub
if (!require("graphicsPLr")) devtools::install_github('Phuong-Le/graphicsPLr')

# install UnsupLearnR itself from GitHub
devtools::install_github('Phuong-Le/UnsupLearnR')
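
To confirm the installation worked, you can check the installed version with base R's `packageVersion()`:

# should print a version number if installation succeeded
packageVersion("UnsupLearnR")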

Example

I will use the iris data as a demonstration.

PCA

library(UnsupLearnR)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(stats)
library(factoextra)
#> Loading required package: ggplot2
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

data(iris)

# let's briefly have a look at the iris data
head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa

# let's do a PCA on the four numeric columns - Sepal.Length, Sepal.Width, Petal.Length and Petal.Width (ie leave out Species)
iris_numeric = select(iris, -c(Species))

## performing PCA
## the `scale. = TRUE` option standardises each variable to mean 0 and variance 1 - this is generally recommended because the variables are usually not on the same units/scales
pca = prcomp(iris_numeric, scale. = TRUE)
pca_ed = as.data.frame(pca$x)
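
# (optional) a quick base-R sanity check of what standardising does:
# each column of scale(iris_numeric) has mean ~0 and standard deviation 1
round(colMeans(scale(iris_numeric)), 10)
apply(scale(iris_numeric), 2, sd)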

# let's colour the PCA plot by iris species
# first create a param dataframe - the column to be coloured has to be named "param"
params = data.frame(param=iris$Species)

plot_pca(pca_ed, params = params)

# it's worth looking at the loadings plot - let's use fviz_pca_var from the factoextra package
# (I think it does a really good job, so I don't need to write a similar function in UnsupLearnR)
library(ggpubr)
fviz_pca_var(pca) +
  theme_pubr()
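
# if you prefer the numbers, the loadings are also available directly from the prcomp object
pca$rotation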

# you can also choose which dimensions (principal components) go on the x and y axes
plot_pca(pca_ed, params = params, x = 2, y = 3)

# PCA preserves the number of dimensions: we started with 4 variables (columns), so we now have 4 principal components
# how do you know how many dimensions to look at? let's check the scree plot (ideally we would do this before even looking at the pca plots themselves)
# the scree plot shows that the first 2 dimensions already capture the vast majority of the variation (ie information) in the data -
# so looking at the last 2 dimensions won't add much more value
scree_plot(pca_ed)
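
As a numeric complement to the scree plot, the proportion of variance explained by each component can be computed directly from the `prcomp` object (base R only):

# proportion of variance explained by each principal component
pve = pca$sdev^2 / sum(pca$sdev^2)
round(pve, 3)
# cumulative proportion - most of the variation sits in the first two components
round(cumsum(pve), 3)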

UnsupLearnR also helps with clustering - note that this is designed around clustering functions from the stats package (here, `kmeans`).

# let's do kmeans clustering
# let's set K = 3 (because it's quite obvious on the pca plot), use 30 initialisations because the first step of kmeans is a randomisation step,
# and set a limit to the number of iterations at 20
k = 3
set.seed(2510)
cluster = make_cluster(iris_numeric, clusterFUN = kmeans, centers = k, nstart = 30, iter.max = 20)
set.seed(NULL)

# now that we have the cluster object, let's see how species are assigned to clusters
plot_labelled_cluster_bars(cluster = cluster, param = params)
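
# for a plain numeric view of the same comparison, one option is to run kmeans from the
# stats package directly and tabulate its cluster assignments against the species
# (an illustrative cross-check only - it does not rely on make_cluster's return value)
set.seed(2510)
km = kmeans(iris_numeric, centers = k, nstart = 30, iter.max = 20)
table(cluster = km$cluster, species = iris$Species)
set.seed(NULL)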

# and map the clustering result to the PCA plot

plot_pca(pca_ed, params, cluster = cluster)

What if it’s unclear how many clusters K to choose? Let’s examine a range of values.

library(ggplot2)
set.seed(2510)

n_clusters = 15
clusters = list()
for (i in 1:n_clusters) {
  clusters[[i]] = make_cluster(iris_numeric, kmeans, centers = i, nstart = 30, iter.max = 20)
}
set.seed(NULL)

# let's look at the total variation within clusters plot
# total variation within clusters measures how far datapoints are from their cluster centers
# as K increases, it always decreases
# a good K is one where you see a kink (an "elbow")
# here the decrease from 3 to 4 is tiny compared to the decrease from 2 to 3, suggesting that 3 is a good value for K
tot_within_var_plot(clusters = clusters) + 
  scale_x_continuous(n.breaks = n_clusters)
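
For reference, a similar elbow curve can be built with base stats and ggplot2 alone - `kmeans` reports the total within-cluster sum of squares as `tot.withinss` (this sketch assumes that is the quantity `tot_within_var_plot` summarises):

set.seed(2510)
tot_within = sapply(1:n_clusters, function(K) {
  kmeans(iris_numeric, centers = K, nstart = 30, iter.max = 20)$tot.withinss
})
set.seed(NULL)

# plot the total within-cluster sum of squares against K and look for the kink
ggplot(data.frame(K = 1:n_clusters, tot_within = tot_within), aes(K, tot_within)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(n.breaks = n_clusters)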