dim(iris)
## [1] 150 5
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
class(iris)
## [1] "data.frame"
# set up plots
par(mfrow=c(2,2))
# plot original data
speciesColors = sapply(as.character(iris$Species), switch, setosa = "red", versicolor = "blue", virginica="green")
data.mat <- cbind(iris$Petal.Length, iris$Sepal.Width, iris$Species)
colnames(data.mat) <- c("Petal.Length", "Sepal.Width", "Species")
plot(data.mat[,-3], col=speciesColors, main="orignal data with class information")
# apply k-means with k=2
set.seed(1)
km.out2 <- kmeans(data.mat[,-3], centers=2)
plot(data.mat[,-3], col=(km.out2$cluster+1), main="k-means clustering results with k=2", xlab="f1", ylab="f2", pch=20, cex=2)
# apply k-means with k=3
set.seed(1)
km.out3 <- kmeans(data.mat[,-3], centers=3)
plot(data.mat[,-3], col=(km.out3$cluster+1), main="k-means clustering results with k=3", xlab="f1", ylab="f2", pch=20, cex=2)
# apply k-means with k=4
set.seed(1)
km.out4 <- kmeans(data.mat[,-3], centers=4)
plot(data.mat[,-3], col=(km.out4$cluster+1), main="k-means clustering results with k=4", xlab="f1", ylab="f2", pch=20, cex=2)
# what is the output from k-means clustering?
km.out2
## K-means clustering with 2 clusters of sizes 51, 99
##
## Cluster means:
## Petal.Length Sepal.Width
## 1 1.492157 3.409804
## 2 4.925253 2.875758
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 11.68196 74.62869
## (between_SS / total_SS = 82.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# check between and within cluster sum of squares.
km.out2$betweenss
## [1] 406.3217
km.out2$tot.withinss
## [1] 86.31065
km.out3$betweenss
## [1] 451.8953
km.out3$tot.withinss
## [1] 40.73707
km.out4$betweenss
## [1] 465.1476
km.out4$tot.withinss
## [1] 27.4847
notice that we use the membership score to control the size of the point to plot. k-means clustering algorithm will not create membership score and will create a hard clustering whereas c-means gives membership score which can be used as a confidence meansure of clustering for each sample.
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
# set up plots
par(mfrow=c(2,2))
# apply c-means with k=1
set.seed(1)
cm.out2 <- cmeans(data.mat[,-3], centers=2, m = 2)
plot(data.mat[,-3], col=(cm.out2$cluster+1), main="c-means clustering results with k=2", xlab="f1", ylab="f2", pch=20, cex=apply(cm.out2$membership+0.5, 1, max)^2)
# apply c-means with k=3
set.seed(1)
cm.out3 <- cmeans(data.mat[,-3], centers=3)
plot(data.mat[,-3], col=(cm.out3$cluster+1), main="c-means clustering results with k=3", xlab="f1", ylab="f2", pch=20, cex=apply(cm.out2$membership+0.5, 1, max)^2)
# apply c-means with k=4
set.seed(1)
cm.out4 <- cmeans(data.mat[,-3], centers=4)
plot(data.mat[,-3], col=(cm.out4$cluster+1), main="c-means clustering results with k=4", xlab="f1", ylab="f2", pch=20, cex=apply(cm.out2$membership+0.5, 1, max)^2)
# (1)
library(ClueR)
## Loading required package: parallel
data(adipocyte)
# (2)
adipocyte.exp <- adipocyte[,-ncol(adipocyte)]
idx <- which(rowSums(abs(adipocyte.exp) > 1) > 0)
adipocyte.reg <- adipocyte.exp[idx,]
# number of genes before filtering
nrow(adipocyte.exp)
## [1] 24213
# number of genes after filtering
nrow(adipocyte.reg)
## [1] 2742
# number of time points in the data
ncol(adipocyte.reg)
## [1] 7
# (3)
standardize <- function(mat) {
means <- apply(mat, 1, mean)
stds <- apply(mat, 1, sd)
tmp <- sweep(mat, 1, means, FUN="-")
mat.stand <- sweep(tmp, 1, stds, FUN="/")
return(mat.stand)
}
adipocyte.scaled <- standardize(adipocyte.reg)
# (4)
set.seed(1)
km <- kmeans(adipocyte.scaled, centers = 13)
# (5)
km.gene.clustered <- names(which(km$cluster == km$cluster["ISL1"]))
# (6)
library(igraph)
## Warning: package 'igraph' was built under R version 3.4.4
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
mat <- cor(t(adipocyte.scaled[km.gene.clustered,]))
mat[mat<0.99]=0
# Make an Igraph object from this matrix:
network <- graph_from_adjacency_matrix(mat, weighted=T, mode="undirected", diag=F)
V(network)$label.cex <- 0.5
l <- layout_on_sphere(network)
plot(network, layout=l, cex=1, vertex.size=3)