From 98e60a8e46146a58b0979421f3816c3e802f7676 Mon Sep 17 00:00:00 2001 From: nwickel Date: Thu, 28 Sep 2023 16:03:06 +0200 Subject: [PATCH] Started playing around a bit with clustering --- code/04_clustering_haum.R | 122 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 code/04_clustering_haum.R diff --git a/code/04_clustering_haum.R b/code/04_clustering_haum.R new file mode 100644 index 0000000..7161c3d --- /dev/null +++ b/code/04_clustering_haum.R @@ -0,0 +1,122 @@ +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") + +# Read data + +dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", + sep = ";", header = TRUE) +dat0$date <- as.Date(dat0$date) +dat0$date.start <- as.POSIXct(dat0$date.start) +dat0$date.stop <- as.POSIXct(dat0$date.stop) +dat0$artwork <- sprintf("%03d", dat0$artwork) + +# Preprocess variables for clustering + +str(dat0) + +# year --> lubridate::year() +# duration --> numeric, remove NA +# topicNumber --> numeric, remove NA +# distance --> numeric, remove NA +# scaleSize --> numeric, remove NA +# rotationDegree --> numeric, remove NA +# holiday --> one/hot coding +# vacations --> one/hot coding +# artwork? --> one/hot coding (72 new variables) +# event? --> one/hot coding (4 new variables) + +dat <- dat0 + +dat$year <- lubridate::year(dat$date) +dat$holiday1 <- ifelse(is.na(dat$holiday), 0, 1) +dat$vacations1 <- ifelse(is.na(dat$vacations), 0, 1) +dat$topicNumber1 <- ifelse(is.na(dat$topicNumber), 0, dat$topicNumber) +dat$duration1 <- ifelse(is.na(dat$duration), 0, dat$duration) +dat$distance1 <- ifelse(is.na(dat$distance), 0, dat$distance) +dat$scaleSize1 <- ifelse(is.na(dat$scaleSize), 0, dat$scaleSize) +dat$rotationDegree1 <- ifelse(is.na(dat$rotationDegree), 0, dat$rotationDegree) + +for (artwork in unique(dat$artwork)) { + dat[[paste0("A", artwork)]] <- ifelse(dat$artwork == artwork, 1, 0) +} + +for (event in unique(dat$event)) { + dat[[event]] <- ifelse(dat$event == event, 1, 0) +} + +mat <- dat[, c("year", "duration", "topicNumber", "distance", "scaleSize", + "rotationDegree", "holiday1", "vacations1", + paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic", + "openPopup")] + + +mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1", + "rotationDegree1", "holiday1", "vacations1", + paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic", + "openPopup")] + +library(cluster) # for hiereachical clustering + +k1 <- kmeans(mat1, 2) +dat$kcluster <- k1$cluster + +mat1$artwork <- dat$artwork +datagg <- aggregate(. ~ artwork, mat1, mean) +aa <- datagg$artwork +datagg$artwork <- NULL + +k2 <- kmeans(datagg, 3) +datagg$cluster <- k2$cluster +datagg <- datagg[order(datagg$cluster), ] +aggregate(cbind(duration1, distance1, scaleSize1, rotationDegree1, + holiday1, vacations1) ~ cluster, datagg, mean) +# --> how to interpret this?? + + +# sample data for hierarchical clustering +n <- 200 +set.seed(1826) + +mat2 <- mat1[sample(nrow(mat1), n), ] +rownames(mat2) <- NULL +a1 <- agnes(mat2) + +d1 <- as.dendrogram(a1) +plot(d1) + +datagg$cluster <- NULL +rownames(datagg) <- NULL +a2 <- agnes(datagg) +d2 <- as.dendrogram(a2) +plot(d2) + +## Clustering for nominal features with nomclust package + +library(nomclust) + +dat <- as.data.frame(lapply(dat0[, c("folder", "holiday", "vacations", "artwork", + "event", "case", "trace")], as.factor)) +mat <- list() +mat$year <- as.numeric(dat$folder) +mat$holiday <- as.numeric(dat$holiday) +mat$vacations <- as.numeric(dat$vacations) +mat$artwork <- as.numeric(dat$artwork) +mat$event <- as.numeric(dat$event) +mat$case <- as.numeric(dat$case) +mat$trace <- as.numeric(dat$trace) + +mat$holiday <- ifelse(is.na(mat$holiday), 0, 1) +mat$vacations <- ifelse(is.na(mat$vacations), 0, 1) + +set.seed(1526) +ids <- sample(nrow(mat), 1000) +mat_small <- mat[ids, ] + +n1 <- nomclust(mat_small) + +n1$mem$clu_3 +dend.plot(n1, clusters = 3) + +mat_small[n1$mem$clu_6 == 6, ] + +cbind(mat_small[order(n1$mem$clu_3), ], n1$mem$clu_3[order(n1$mem$clu_3)]) +