Hierarchical
Clustering
Using Hclust
Clustering: Hierarchical Clustering
# Read the data: choose the reviews CSV interactively and load its first
# 500 rows, keeping text columns as character.
reviews <- read.csv(file.choose(), stringsAsFactors = FALSE, nrows = 500)
View(reviews)
names(reviews)
# Keep only the review text. stringsAsFactors = FALSE is passed explicitly so
# the column stays character on R < 4.0, where data.frame() would otherwise
# convert it to a factor before it reaches the corpus.
reviews1 <- data.frame(reviews$reviews.text, stringsAsFactors = FALSE)
names(reviews1)
dim(reviews1)
names(reviews1)[1] <- "reviews"
# To remove emojis (left disabled). iconv() has to be applied to the text
# column rather than to the whole data frame:
# reviews1$reviews <- iconv(reviews1$reviews, "UTF-8", "ASCII", sub = "")
Rupak Roy
Clustering: Hierarchical Clustering
# Build a text corpus from the review text
library(tm)
review.corpus <- Corpus(VectorSource(reviews1$reviews))
summary(review.corpus)
inspect(review.corpus[1:5]) # Inspect the first elements of the corpus

# Replace non-convertible bytes in the corpus with strings showing their hex
# codes. This mainly targets emojis, which otherwise raise errors such as
# "invalid input in 'utf8towcs'".
# Plain character functions are wrapped in content_transformer() so that
# tm_map() keeps the documents as text documents instead of replacing them
# with bare character vectors.
review.corpus <- tm_map(
  review.corpus,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
# or
# review.corpus <- tm_map(review.corpus, PlainTextDocument)
# or list the offending tokens in my_stopwords (defined further down).

# Data transformations - cleaning
# Convert to lower case (tolower is a base function, hence the wrapper)
review.corpus <- tm_map(review.corpus, content_transformer(tolower))
# Remove extra white space
review.corpus <- tm_map(review.corpus, stripWhitespace)
# Remove punctuation
review.corpus <- tm_map(review.corpus, removePunctuation)
# Remove numbers
review.corpus <- tm_map(review.corpus, removeNumbers)
# Stop words: the standard English list plus custom additions.
# NOTE(review): stop words are removed after punctuation, so entries that
# contain an apostrophe may no longer match the cleaned text - confirm the
# intended order.
my_stopwords <- c(
  stopwords("english"),
  "@", "http*", "url", "www*", "itã¢â‚¬å¡ãƒâ€žãƒâ´s"
)
review.corpus <- tm_map(review.corpus, removeWords, my_stopwords)
Rupak Roy
Clustering: Hierarchical Clustering
# Build the term-document matrix (terms in rows, documents in columns)
review.tdm <- TermDocumentMatrix(review.corpus)
review.tdm
dim(review.tdm) # Dimensions of the term-document matrix
# Inspect the top-left corner of the matrix. The index ranges are clamped to
# the actual dimensions so the preview cannot run out of bounds.
inspect(review.tdm[
  seq_len(min(10, nrow(review.tdm))),
  seq_len(min(10, ncol(review.tdm)))
])
# Remove sparse terms (words that occur infrequently).
# 0.97 is the maximal allowed sparsity: terms whose sparsity exceeds 97%
# are dropped.
review.imp <- removeSparseTerms(review.tdm, 0.97)
review.imp
# Fewer than 10 terms may survive the sparsity filter, so clamp again.
inspect(review.imp[
  seq_len(min(10, nrow(review.imp))),
  seq_len(min(10, ncol(review.imp)))
])
review.matrix <- as.matrix(review.imp)
#-----------Hclust-----------------------------------
# Measure the distance between the words/terms (clustering needs the
# distance between the data points in order to group them).
# The rows of review.matrix are terms, so dist() compares terms; scale()
# standardises each column of the matrix first.
distmatrix <- dist(scale(review.matrix), method = "euclidean")
# Apply hierarchical clustering with the "ward.D2" agglomeration method
review.h <- hclust(distmatrix, method = "ward.D2")
Rupak Roy
Clustering: Hierarchical Clustering
# Draw the dendrogram, i.e. the hierarchical structure of the clusters,
# and frame 5 clusters on top of it.
plot(
  review.h,
  cex = 0.1,
  hang = -1,
  main = "Cluster Dendogram Plot"
)
rect.hclust(review.h, k = 5)

# The same tree rendered through ggdendro
library(ggdendro)
ggdendrogram(
  review.h,
  rotate = TRUE,
  size = 3,
  hang = -1,
  cex = 0.6,
  theme_dendro = FALSE
)
# hang = -1 puts the labels at the same height
# Load the code of the A2Rplot() function.
# NOTE(review): absolute, machine-specific path - adjust to where
# A2RplotCode.R lives on your system.
source("E:/data2dim/Text Mining/datasets/Clustering/A2RplotCode.R")
# Coloured dendrogram: switch to a light grey background, keeping the previous
# graphical parameters in `op` so they can be restored afterwards.
op <- par(bg = "#EFEFEF")
A2Rplot(
  review.h,
  k = 5, hang = -1, cex = 0.5, boxes = FALSE,
  col.up = "grey50",
  col.down = c("green", "blue", "black", "red", "yellow", "orange", "brown")
)
# Restore the saved parameters so the background does not leak into the
# plots that follow.
par(op)
# Triangle-style dendrogram
p <- as.dendrogram(review.h)
plot(p, type = "triangle", ylab = "Height")
rect.hclust(review.h, k = 5)

# Zoom into one part of the dendrogram
plot(
  p,
  xlim = c(88, 92),
  ylim = c(1, 74)
)
# ylim = the height range to show
# xlim = the position values of the labels; the labels themselves can be
#        listed with review.h$labels
review.h$labels
Rupak Roy
Clustering: Hierarchical Clustering
# Change node and edge appearance
nodePar <- list(
  lab.cex = 0.6, pch = c(NA, 19),
  cex = 0.7, col = "blue"
)
# The dendrogram is drawn vertically (horiz is not set), so the height is on
# the y axis and is labelled there.
plot(p,
  ylab = "Height", nodePar = nodePar,
  edgePar = list(col = 4:3, lwd = 2:1)
)
# nodePar: a list of plotting parameters to use for the nodes (see
# ?points). Default value is NULL. The list may contain components named
# pch, cex, col, xpd, and/or bg each of which can have length two for
# specifying separate attributes for inner nodes and leaves.
# edgePar: a list of plotting parameters to use for the edge segments
# (see ?segments). The list may contain components named col, lty and
# lwd (for the segments). As with nodePar, each can have length two for
# differentiating leaves and inner nodes.
# leaflab: a string specifying how leaves are labeled. The default
# "perpendicular" writes text vertically; "textlike" writes text horizontally
# (in a rectangle), and "none" suppresses leaf labels.
Rupak Roy
Clustering: Hierarchical Clustering
# Phylogenetic plots give a more sophisticated rendering of the dendrogram.
# install.packages("ape")
library("ape")

# Default plot
plot(as.phylo(review.h))

# The same code appears in the clustering chapter of the machine learning
# course; there it raises an error because the label values are stored as a
# factor, which is worked around like this:
# crimeHclust1 <- crimeHclust
# str(crimeHclust1)
# crimeHclust1$labels <- as.character(crimeHclust1$labels)

# Smaller labels, moved slightly away from the tips
plot(
  as.phylo(review.h),
  cex = 0.6,
  label.offset = 0.5
)

# Cladogram
plot(
  as.phylo(review.h),
  type = "cladogram",
  cex = 0.6,
  label.offset = 0.5
)
Rupak Roy
Clustering: Hierarchical Clustering
# Unrooted layout, drawn without margins
plot(
  as.phylo(review.h),
  type = "unrooted",
  cex = 0.6,
  no.margin = TRUE
)

# Fan layout
plot(as.phylo(review.h), type = "fan")

# Radial layout
plot(as.phylo(review.h), type = "radial")
# Group the fan plot into 5 clusters, one colour per cluster.
# The palette needs one entry per cluster: with fewer colours than clusters,
# colors[clusters] is NA for the highest cluster ids.
colors <- c("red", "blue", "green", "black", "orange")
# Cluster id (1..5) of every leaf. Named `clusters` rather than `c` so the
# base function c() is not shadowed.
clusters <- cutree(review.h, k = 5)
plot(as.phylo(review.h),
  type = "fan", tip.color = colors[clusters],
  label.offset = 1, cex = 0.7
)
Rupak Roy

Hierarchical Clustering - Text Mining/NLP

  • 1.
  • 2.
    Clustering: Hierarchical Clustering #read the data reviews<-read.csv(file.choose(),stringsAsFactors=FALSE,nrows = 500) View(reviews) names(reviews) reviews1<-data.frame(reviews$reviews.text) names(reviews1) dim(reviews1) names(reviews1)[1]<-"reviews" #to remove emojis #reviews1 <- iconv(reviews1, 'UTF-8', 'ASCII') Rupak Roy
  • 3.
    Clustering: Hierarchical Clustering #Build a Text Corpus library(tm) review.corpus<-Corpus(VectorSource(reviews1$reviews)) summary(review.corpus) inspect(review.corpus[1:5]) #Inspecting elements in Corpus #it will replace non-convertible bytes in the Corpus with strings showing their hex codes #Especially the emojis which throws error like invalid input in 'utf8towcs'. review.corpus<-tm_map(review.corpus, function(x) iconv(enc2utf8(x), sub = "byte")) #or #review.corpus <- tm_map(review.corpus, PlainTextDocument) #or define in the stop words my_stopwords<-c(stopwords('english'),'@','http*','url','www*','itã¢â‚¬å¡ãƒâ€žãƒâ´s') #Data Transformations -Cleaning #Converting to lower case review.corpus<-tm_map(review.corpus,tolower) #Removing extra white space review.corpus<-tm_map(review.corpus,stripWhitespace) #Removing punctuations review.corpus<-tm_map(review.corpus,removePunctuation) #Removing numbers review.corpus<-tm_map(review.corpus,removeNumbers) #Can add more words apart from standard list my_stopwords<-c(stopwords('english'),'@','http*','url','www*','itã¢â‚¬å¡ãƒâ€žãƒâ´s') review.corpus<-tm_map(review.corpus,removeWords,my_stopwords) Rupak Roy
  • 4.
    Clustering: Hierarchical Clustering #Build term document matrix review.tdm<-TermDocumentMatrix(review.corpus) review.tdm dim(review.tdm) #Dimensions of term document matrix inspect(review.tdm[1:10,1:10]) #Inspecting the term document matrix #Removing sparse terms(Words that occur infrequently) #here 97% refers remove at least 97% of sparse review.imp<-removeSparseTerms(review.tdm,0.97) review.imp inspect(review.imp[1:10,1:10]) review.matrix<-as.matrix(review.imp) #-----------Hclust----------------------------------- #Measure the distance between the words/terms(as we know in clustering we need the distance between the data points to group) distmatrix<-dist(scale(review.matrix),method="euclidean") #Apply hierarchical clustering review.h<-hclust(distmatrix,method="ward.D2") Rupak Roy
  • 5.
    Clustering: Hierarchical Clustering #plot dendrogram which represents the hierarchical structure of clusters plot(review.h,cex=0.1,hang=-1,main="Cluster Dendogram Plot") rect.hclust(review.h,5) library(ggdendro) ggdendrogram(review.h, rotate = TRUE, size = 3,hang=-1,cex=0.6,theme_dendro = FALSE) #where hang=-1 to put the labels at the same height # load code of A2R function source("E:/data2dim/Text Mining/datasets/Clustering/A2RplotCode.R") # colored dendrogram op = par(bg = "#EFEFEF") A2Rplot(review.h, k = 5, hang = -1,cex=0.5,boxes = FALSE, col.up = "grey50", col.down = c("green","blue", "black","red","yellow","orange","brown")) #Triangle plot p<-as.dendrogram(review.h) plot(p, type = "triangle", ylab = "Height") rect.hclust(review.h,5) # Zoom to the first dendrogram plot(p, xlim = c(88, 92), ylim = c(1,74)) #ylim = the height #xlim= is the position values of the labels we can get the values in review.h$labels review.h$labels Rupak Roy
  • 6.
    Clustering: Hierarchical Clustering #Change edge color nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), cex = 0.7, col = "blue") plot(p, xlab = "Height", nodePar = nodePar, edgePar = list(col = 4:3, lwd = 2:1)) #nodePar: a list of plotting parameters to use for the nodes (see ?points). Default value is NULL. The list may contain components named pch, cex, col, xpd, and/or bg each of which can have length two for specifying separate attributes for inner nodes and leaves. #edgePar: a list of plotting parameters to use for the edge segments (see ?segments). The list may contain components named col, lty and lwd (for the segments). As with nodePar, each can have length two for differentiating leaves and inner nodes. #leaflab: a string specifying how leaves are labeled. The default "perpendicular" write text vertically; "textlike" writes text horizontally (in a rectangle), and "none" suppresses leaf labels. Rupak Roy
  • 7.
    Clustering: Hierarchical Clustering #Phylogenetic plots can be used to produce a more sophisticated dendrogram. # install.packages("ape") library("ape") # Default plot plot(as.phylo(review.h)) #Same code mentioned over the clustering chapter of machine learning course #it will show error as the label values are stored in factor #crimeHclust1<-crimeHclust #str(crimeHclust1) #crimeHclust1$labels<-as.character(crimeHclust1$labels) plot(as.phylo(review.h), cex = 0.6, label.offset = 0.5) # Cladogram plot(as.phylo(review.h), type = "cladogram", cex = 0.6, label.offset = 0.5) Rupak Roy
  • 8.
    Clustering: Hierarchical Clustering #Unrooted plot(as.phylo(review.h), type = "unrooted", cex = 0.6, no.margin = TRUE) # Fan plot(as.phylo(review.h), type = "fan") # Radial plot(as.phylo(review.h), type = "radial") # Group the Fan type into 5 clusters colors = c("red", "blue", "green", "black") c = cutree(review.h, 5) plot(as.phylo(review.h), type = "fan", tip.color = colors[c], label.offset = 1, cex = 0.7) Rupak Roy