Started playing around a bit with clustering
This commit is contained in:
parent
00a6f19cf6
commit
98e60a8e46
122
code/04_clustering_haum.R
Normal file
122
code/04_clustering_haum.R
Normal file
@ -0,0 +1,122 @@
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
# Read data
|
||||
|
||||
dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv",
|
||||
sep = ";", header = TRUE)
|
||||
dat0$date <- as.Date(dat0$date)
|
||||
dat0$date.start <- as.POSIXct(dat0$date.start)
|
||||
dat0$date.stop <- as.POSIXct(dat0$date.stop)
|
||||
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
||||
|
||||
# Preprocess variables for clustering
|
||||
|
||||
str(dat0)
|
||||
|
||||
# year --> lubridate::year()
|
||||
# duration --> numeric, remove NA
|
||||
# topicNumber --> numeric, remove NA
|
||||
# distance --> numeric, remove NA
|
||||
# scaleSize --> numeric, remove NA
|
||||
# rotationDegree --> numeric, remove NA
|
||||
# holiday --> one/hot coding
|
||||
# vacations --> one/hot coding
|
||||
# artwork? --> one/hot coding (72 new variables)
|
||||
# event? --> one/hot coding (4 new variables)
|
||||
|
||||
dat <- dat0
|
||||
|
||||
dat$year <- lubridate::year(dat$date)
|
||||
dat$holiday1 <- ifelse(is.na(dat$holiday), 0, 1)
|
||||
dat$vacations1 <- ifelse(is.na(dat$vacations), 0, 1)
|
||||
dat$topicNumber1 <- ifelse(is.na(dat$topicNumber), 0, dat$topicNumber)
|
||||
dat$duration1 <- ifelse(is.na(dat$duration), 0, dat$duration)
|
||||
dat$distance1 <- ifelse(is.na(dat$distance), 0, dat$distance)
|
||||
dat$scaleSize1 <- ifelse(is.na(dat$scaleSize), 0, dat$scaleSize)
|
||||
dat$rotationDegree1 <- ifelse(is.na(dat$rotationDegree), 0, dat$rotationDegree)
|
||||
|
||||
for (artwork in unique(dat$artwork)) {
|
||||
dat[[paste0("A", artwork)]] <- ifelse(dat$artwork == artwork, 1, 0)
|
||||
}
|
||||
|
||||
for (event in unique(dat$event)) {
|
||||
dat[[event]] <- ifelse(dat$event == event, 1, 0)
|
||||
}
|
||||
|
||||
mat <- dat[, c("year", "duration", "topicNumber", "distance", "scaleSize",
|
||||
"rotationDegree", "holiday1", "vacations1",
|
||||
paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic",
|
||||
"openPopup")]
|
||||
|
||||
|
||||
mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1",
|
||||
"rotationDegree1", "holiday1", "vacations1",
|
||||
paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic",
|
||||
"openPopup")]
|
||||
|
||||
library(cluster) # for hiereachical clustering
|
||||
|
||||
k1 <- kmeans(mat1, 2)
|
||||
dat$kcluster <- k1$cluster
|
||||
|
||||
mat1$artwork <- dat$artwork
|
||||
datagg <- aggregate(. ~ artwork, mat1, mean)
|
||||
aa <- datagg$artwork
|
||||
datagg$artwork <- NULL
|
||||
|
||||
k2 <- kmeans(datagg, 3)
|
||||
datagg$cluster <- k2$cluster
|
||||
datagg <- datagg[order(datagg$cluster), ]
|
||||
aggregate(cbind(duration1, distance1, scaleSize1, rotationDegree1,
|
||||
holiday1, vacations1) ~ cluster, datagg, mean)
|
||||
# --> how to interpret this??
|
||||
|
||||
|
||||
# sample data for hierarchical clustering
|
||||
n <- 200
|
||||
set.seed(1826)
|
||||
|
||||
mat2 <- mat1[sample(nrow(mat1), n), ]
|
||||
rownames(mat2) <- NULL
|
||||
a1 <- agnes(mat2)
|
||||
|
||||
d1 <- as.dendrogram(a1)
|
||||
plot(d1)
|
||||
|
||||
datagg$cluster <- NULL
|
||||
rownames(datagg) <- NULL
|
||||
a2 <- agnes(datagg)
|
||||
d2 <- as.dendrogram(a2)
|
||||
plot(d2)
|
||||
|
||||
## Clustering for nominal features with nomclust package
|
||||
|
||||
library(nomclust)
|
||||
|
||||
dat <- as.data.frame(lapply(dat0[, c("folder", "holiday", "vacations", "artwork",
|
||||
"event", "case", "trace")], as.factor))
|
||||
mat <- list()
|
||||
mat$year <- as.numeric(dat$folder)
|
||||
mat$holiday <- as.numeric(dat$holiday)
|
||||
mat$vacations <- as.numeric(dat$vacations)
|
||||
mat$artwork <- as.numeric(dat$artwork)
|
||||
mat$event <- as.numeric(dat$event)
|
||||
mat$case <- as.numeric(dat$case)
|
||||
mat$trace <- as.numeric(dat$trace)
|
||||
|
||||
mat$holiday <- ifelse(is.na(mat$holiday), 0, 1)
|
||||
mat$vacations <- ifelse(is.na(mat$vacations), 0, 1)
|
||||
|
||||
set.seed(1526)
|
||||
ids <- sample(nrow(mat), 1000)
|
||||
mat_small <- mat[ids, ]
|
||||
|
||||
n1 <- nomclust(mat_small)
|
||||
|
||||
n1$mem$clu_3
|
||||
dend.plot(n1, clusters = 3)
|
||||
|
||||
mat_small[n1$mem$clu_6 == 6, ]
|
||||
|
||||
cbind(mat_small[order(n1$mem$clu_3), ], n1$mem$clu_3[order(n1$mem$clu_3)])
|
||||
|
Loading…
Reference in New Issue
Block a user