# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") # Read data dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", sep = ";", header = TRUE) dat0$date <- as.Date(dat0$date) dat0$date.start <- as.POSIXct(dat0$date.start) dat0$date.stop <- as.POSIXct(dat0$date.stop) dat0$artwork <- sprintf("%03d", dat0$artwork) # Preprocess variables for clustering str(dat0) # year --> lubridate::year() # duration --> numeric, remove NA # topicNumber --> numeric, remove NA # distance --> numeric, remove NA # scaleSize --> numeric, remove NA # rotationDegree --> numeric, remove NA # holiday --> one/hot coding # vacations --> one/hot coding # artwork? --> one/hot coding (72 new variables) # event? --> one/hot coding (4 new variables) dat <- dat0 dat$year <- lubridate::year(dat$date) dat$holiday1 <- ifelse(is.na(dat$holiday), 0, 1) dat$vacations1 <- ifelse(is.na(dat$vacations), 0, 1) dat$topicNumber1 <- ifelse(is.na(dat$topicNumber), 0, dat$topicNumber) dat$duration1 <- ifelse(is.na(dat$duration), 0, dat$duration) dat$distance1 <- ifelse(is.na(dat$distance), 0, dat$distance) dat$scaleSize1 <- ifelse(is.na(dat$scaleSize), 0, dat$scaleSize) dat$rotationDegree1 <- ifelse(is.na(dat$rotationDegree), 0, dat$rotationDegree) for (artwork in unique(dat$artwork)) { dat[[paste0("A", artwork)]] <- ifelse(dat$artwork == artwork, 1, 0) } for (event in unique(dat$event)) { dat[[event]] <- ifelse(dat$event == event, 1, 0) } mat <- dat[, c("year", "duration", "topicNumber", "distance", "scaleSize", "rotationDegree", "holiday1", "vacations1", paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic", "openPopup")] mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1", "rotationDegree1", "holiday1", "vacations1", paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic", "openPopup")] library(cluster) # for hiereachical clustering k1 <- kmeans(mat1, 2) dat$kcluster <- k1$cluster mat1$artwork <- dat$artwork datagg <- aggregate(. ~ artwork, mat1, mean) aa <- datagg$artwork datagg$artwork <- NULL k2 <- kmeans(datagg, 3) datagg$cluster <- k2$cluster datagg <- datagg[order(datagg$cluster), ] aggregate(cbind(duration1, distance1, scaleSize1, rotationDegree1, holiday1, vacations1) ~ cluster, datagg, mean) # --> how to interpret this?? # sample data for hierarchical clustering n <- 200 set.seed(1826) mat2 <- mat1[sample(nrow(mat1), n), ] rownames(mat2) <- NULL a1 <- agnes(mat2) d1 <- as.dendrogram(a1) plot(d1) datagg$cluster <- NULL rownames(datagg) <- NULL a2 <- agnes(datagg) d2 <- as.dendrogram(a2) plot(d2) ## Clustering for nominal features with nomclust package library(nomclust) dat <- as.data.frame(lapply(dat0[, c("folder", "holiday", "vacations", "artwork", "event", "case", "trace")], as.factor)) mat <- list() mat$year <- as.numeric(dat$folder) mat$holiday <- as.numeric(dat$holiday) mat$vacations <- as.numeric(dat$vacations) mat$artwork <- as.numeric(dat$artwork) mat$event <- as.numeric(dat$event) mat$case <- as.numeric(dat$case) mat$trace <- as.numeric(dat$trace) mat$holiday <- ifelse(is.na(mat$holiday), 0, 1) mat$vacations <- ifelse(is.na(mat$vacations), 0, 1) set.seed(1526) ids <- sample(nrow(mat), 1000) mat_small <- mat[ids, ] n1 <- nomclust(mat_small) n1$mem$clu_3 dend.plot(n1, clusters = 3) mat_small[n1$mem$clu_6 == 6, ] cbind(mat_small[order(n1$mem$clu_3), ], n1$mem$clu_3[order(n1$mem$clu_3)])