From ea0660817a8cbf7fc85d1ffbc1c3800f00492287 Mon Sep 17 00:00:00 2001 From: nwickel Date: Mon, 4 Mar 2024 17:32:50 +0100 Subject: [PATCH] More work on trace clustering --- code/09_user-navigation.R | 248 +++++++++++++++++++++++++++++++++----- 1 file changed, 221 insertions(+), 27 deletions(-) diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R index 0f03a8d..9c67f74 100644 --- a/code/09_user-navigation.R +++ b/code/09_user-navigation.R @@ -126,11 +126,177 @@ datcase$duration <- datcase$max_time - datcase$min_time datcase$min_time <- NULL datcase$max_time <- NULL + +check_infocards <- function(subdata, artworks) { + infocard_only <- NULL + if(any(unique(subdata$item) %in% artworks)) { + infocard_only <- FALSE + } else { + infocard_only <- TRUE + } + as.numeric(infocard_only) +} + +artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")] + +datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks) + + +# Clean up NAs +datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance) +datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize) +datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree) +datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist) +datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details) +datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info) +datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition) +datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks) +datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe) +datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik) +datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema) +datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics) +datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards) + + + cor_mat <- cor(datcase[, -1], use = "pairwise") diag(cor_mat) <- NA heatmap(cor_mat) -# TODO: Add info if all items of a case are information cards?? + +normalize <- function(x) { + (x - min(x)) / (max(x) - min(x)) +} + + +#df <- as.data.frame(lapply(datcase[, -1], normalize)) +df <- as.data.frame(lapply(datcase[, -1], scale)) +#df <- datcase[, -1] + +# "Flatten" with PCA +pc <- prcomp(df) +coor_2d <- as.data.frame(pc$x[, c(1, 2)]) +coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)]) + +plot(coor_2d) +rgl::plot3d(coor_3d) + +#--------------- (2.1) K-Means clustering --------------- + +mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") + +k1 <- kmeans(df, 4) + +grp_km <- k1$cluster +table(grp_km) + +fviz_cluster(list(data = df, cluster = grp_km), + palette = mycols, + ellipse.type = "convex", + show.clust.cent = FALSE, + ggtheme = theme_bw()) + +plot(coor_2d, col = mycols[grp_km]) + +rgl::plot3d(coor_3d, col = mycols[grp_km]) + +aggregate(. ~ grp_km, df, mean) + +#--------------- (2.2) Hierarchical clustering --------------- + +mat <- dist(df) + +h1 <- hclust(mat, method = "average") +h2 <- hclust(mat, method = "complete") +h3 <- hclust(mat, method = "ward.D") +h4 <- hclust(mat, method = "ward.D2") +h5 <- hclust(mat, method = "single") + +# Cophenetic Distances, for each linkage (runs quite some time!) +c1 <- cophenetic(h1) +c2 <- cophenetic(h2) +c3 <- cophenetic(h3) +c4 <- cophenetic(h4) +c5 <- cophenetic(h5) + +# Correlations +cor(mat, c1) +cor(mat, c2) +cor(mat, c3) +cor(mat, c4) +cor(mat, c5) + +# https://en.wikipedia.org/wiki/Cophenetic_correlation +# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering + +hc <- h4 + +# Something like a scree plot (??) +plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) + +k <- 4 + +grp_hclust <- cutree(hc, k = k) + +table(grp_hclust) + +fviz_cluster(list(data = df, cluster = grp_hclust), + palette = mycols, + ellipse.type = "convex", + show.clust.cent = FALSE, + ggtheme = theme_bw()) + +plot(coor_2d, col = mycols[grp_hclust]) +rgl::plot3d(coor_3d, col = mycols[grp_hclust]) + +table(dattree[grp_hclust == 1, "Pattern"]) +table(dattree[grp_hclust == 2, "Pattern"]) +table(dattree[grp_hclust == 3, "Pattern"]) +table(dattree[grp_hclust == 4, "Pattern"]) + + +aggregate(. ~ grp_hclust, df, mean) + + +aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, + nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase, + mean) + +#--------------- (2.3) DBSCAN clustering --------------- + +library(dbscan) +d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1) +hullplot(df, d1) + +grp_db <- d1$cluster +table(grp_db) + +kNNdistplot(df, k = ncol(df)) +abline(h = 0.2, col = "red") +abline(h = 0.06, col = "red") + +fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]), + #palette = mycols, + ellipse.type = "convex", + show.clust.cent = FALSE, + ggtheme = theme_bw()) + +mycols <- c("black", mycols) + +plot(coor_2d, col = mycols[grp_db + 1]) +rgl::plot3d(coor_3d, col = mycols[grp_db + 1]) + +aggregate(. ~ grp_db, df, mean) + +table(dattree[grp_db == 0, "Pattern"]) +table(dattree[grp_db == 1, "Pattern"]) +table(dattree[grp_db == 2, "Pattern"]) +table(dattree[grp_db == 3, "Pattern"]) + +# Does not really work with these features! + + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # Navigation types by Bousbia et al. (2010): # - Overviewing: this value is close to the Canter “scanning” value. It @@ -181,26 +347,28 @@ heatmap(cor_mat) # * Duration per artwork is low: "ave_duration_item" / datcase$duration dattree <- data.frame(case = datcase$case, - Duration = datcase$duration, + #Duration = datcase$duration, PropItems = datcase$nitems / length(unique(dat$item)), - PropTopic = datcase$nopenTopic / datcase$nflipCard, - PropPopup = datcase$nopenPopup / datcase$nopenTopic, + #PropTopic = datcase$nopenTopic / datcase$nflipCard, + #PropPopup = datcase$nopenPopup / datcase$nopenTopic, # SearchInfo = # 2*(((datcase$nopenPopup / datcase$nopenTopic) * # (datcase$nopenTopic / datcase$nflipCard)) / # ((datcase$nopenPopup / datcase$nopenTopic) + # (datcase$nopenTopic / datcase$nflipCard)) # ), + SearchInfo = datcase$nopenTopic / datcase$nflipCard + + datcase$nopenPopup / datcase$nopenTopic, PropMoves = datcase$nmove / datcase$length, PathLinearity = datcase$nitems / datcase$npaths, Singularity = datcase$npaths / datcase$length ) #dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo) -#dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo) +dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo) -dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic) -dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup) +#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic) +#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup) get_centrality <- function(case, data) { @@ -227,9 +395,10 @@ centrality <- do.call(rbind, centrality) save(centrality, file = "results/haum/tmp_centrality.RData") -dattree$centr_degree <- centrality[, 1] -dattree$centr_degree_loops <- centrality[, 2] -dattree$centr_between <- centrality[, 3] +#dattree$centr_degree <- centrality[, 1] +#dattree$centr_degree_loops <- centrality[, 2] +dattree$DegreeCentrality <- centrality[, 2] +#dattree$BetweenCentrality <- centrality[, 3] ## Add average duration per item @@ -247,7 +416,7 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration rm(tmp) -plot(dattree) +plot(dattree[, -1], pch = ".") par(mfrow = c(3,4)) hist(dattree$Duration, breaks = 50, main = "") @@ -268,13 +437,13 @@ diag(cor_mat) <- NA heatmap(cor_mat) -dattree$Pattern <- "Dispersion" -dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & - dattree$Singularity > 0.8, "Scholar", - dattree$Pattern) -dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & - dattree$centr_between > 0.5, "Star", - dattree$Pattern) +# dattree$Pattern <- "Dispersion" +# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & +# dattree$Singularity > 0.8, "Scholar", +# dattree$Pattern) +# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & +# dattree$centr_between > 0.5, "Star", +# dattree$Pattern) write.table(dattree, file = "results/haum/dattree.csv", @@ -302,11 +471,22 @@ df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) / (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE)) +#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +df <- dattree[, -1] +df$AvDurItem <- normalize(df$AvDurItem) +df$SearchInfo <- normalize(df$SearchInfo) +df$InfocardOnly <- datcase$infocardOnly +summary(df) +#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + # "Flatten" with PCA pc <- prcomp(df) coor_2d <- as.data.frame(pc$x[, c(1, 2)]) coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)]) +plot(coor_2d) +rgl::plot3d(coor_3d) + #--------------- (2.1) K-Means clustering --------------- mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") @@ -326,10 +506,11 @@ plot(coor_2d, col = mycols[grp_km]) rgl::plot3d(coor_3d, col = mycols[grp_km]) +aggregate(. ~ grp_km, df, mean) + #--------------- (2.2) Hierarchical clustering --------------- mat <- dist(df) -# TODO: Do I need to scale all variables? h1 <- hclust(mat, method = "average") h2 <- hclust(mat, method = "complete") @@ -354,7 +535,7 @@ cor(mat, c5) # https://en.wikipedia.org/wiki/Cophenetic_correlation # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering -hc <- h1 +hc <- h4 # Something like a scree plot (??) plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) @@ -390,7 +571,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, #--------------- (2.3) DBSCAN clustering --------------- library(dbscan) -d1 <- dbscan(df, eps = .2, minPts = 9) +d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1) hullplot(df, d1) grp_db <- d1$cluster @@ -398,13 +579,18 @@ table(grp_db) kNNdistplot(df, k = ncol(df)) abline(h = 0.2, col = "red") +abline(h = 0.06, col = "red") fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]), - palette = mycols, + #palette = mycols, ellipse.type = "convex", show.clust.cent = FALSE, ggtheme = theme_bw()) +mycols <- c("black", mycols) + +plot(coor_2d, col = mycols[grp_db + 1]) +legend("topleft", paste("Cl", 0:4), col = mycols, pch = 21) rgl::plot3d(coor_3d, col = mycols[grp_db + 1]) aggregate(. ~ grp_db, df, mean) @@ -462,17 +648,24 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df, library(rpart) library(partykit) -dattree_db <- dattree[grp_db != 0, ] +dattree_db <- dattree[grp_db != 0, -1] dattree_db$grp <- factor(grp_db[grp_db != 0]) dattree_db$Pattern <- factor(dattree_db$Pattern) +c1 <- rpart(grp ~ ., data = dattree_db, method = "class") +plot(as.party(c1)) + +c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class") +plot(as.party(c2)) + + + c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves + Pattern, data = dattree_db, method = "class") plot(as.party(c1)) # with conditional tree -c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + - PropMoves + Pattern, data = dattree_db, alpha = 0.5) +c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5) plot(c2) c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + @@ -484,8 +677,7 @@ c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + plot(c4) # with excluded points -c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup + - PropMoves + factor(Pattern), data = dattree, alpha = 1) +c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05) plot(c5) # with excluded points @@ -603,3 +795,5 @@ igraph::centr_degree(inet, loops = FALSE) igraph::centr_betw(inet) igraph::centr_clo(inet) + +