From f8c176707476230b0e399482efa9f3d0b4d4b8b6 Mon Sep 17 00:00:00 2001 From: nwickel Date: Tue, 5 Mar 2024 16:25:49 +0100 Subject: [PATCH] Played around with clustering; switched to glower distances and removed kmeans clustering --- code/09_user-navigation.R | 433 ++++++++++---------------------------- 1 file changed, 117 insertions(+), 316 deletions(-) diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R index 9c67f74..453cd5b 100644 --- a/code/09_user-navigation.R +++ b/code/09_user-navigation.R @@ -74,10 +74,12 @@ topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6] topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7] topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8] -datcase <- datcase |> +datcase <- datcase |> merge(eventtab, by = "case", all = TRUE) |> merge(topictab, by = "case", all = TRUE) +rm(eventtab, topictab) + datcase$ntopiccards <- aggregate(topic ~ case, dat, function(x) ifelse(all(is.na(x)), NA, length(na.omit(x))), na.action = @@ -136,6 +138,7 @@ check_infocards <- function(subdata, artworks) { } as.numeric(infocard_only) } +# TODO: Move to helper file artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")] @@ -167,214 +170,43 @@ heatmap(cor_mat) normalize <- function(x) { (x - min(x)) / (max(x) - min(x)) } - - -#df <- as.data.frame(lapply(datcase[, -1], normalize)) -df <- as.data.frame(lapply(datcase[, -1], scale)) -#df <- datcase[, -1] - -# "Flatten" with PCA -pc <- prcomp(df) -coor_2d <- as.data.frame(pc$x[, c(1, 2)]) -coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)]) - -plot(coor_2d) -rgl::plot3d(coor_3d) - -#--------------- (2.1) K-Means clustering --------------- - -mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") - -k1 <- kmeans(df, 4) - -grp_km <- k1$cluster -table(grp_km) - -fviz_cluster(list(data = df, cluster = grp_km), - palette = mycols, - ellipse.type = "convex", - show.clust.cent = FALSE, - ggtheme = theme_bw()) - -plot(coor_2d, col = mycols[grp_km]) - -rgl::plot3d(coor_3d, col = mycols[grp_km]) - -aggregate(. ~ grp_km, df, mean) - -#--------------- (2.2) Hierarchical clustering --------------- - -mat <- dist(df) - -h1 <- hclust(mat, method = "average") -h2 <- hclust(mat, method = "complete") -h3 <- hclust(mat, method = "ward.D") -h4 <- hclust(mat, method = "ward.D2") -h5 <- hclust(mat, method = "single") - -# Cophenetic Distances, for each linkage (runs quite some time!) -c1 <- cophenetic(h1) -c2 <- cophenetic(h2) -c3 <- cophenetic(h3) -c4 <- cophenetic(h4) -c5 <- cophenetic(h5) - -# Correlations -cor(mat, c1) -cor(mat, c2) -cor(mat, c3) -cor(mat, c4) -cor(mat, c5) - -# https://en.wikipedia.org/wiki/Cophenetic_correlation -# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering - -hc <- h4 - -# Something like a scree plot (??) -plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) - -k <- 4 - -grp_hclust <- cutree(hc, k = k) - -table(grp_hclust) - -fviz_cluster(list(data = df, cluster = grp_hclust), - palette = mycols, - ellipse.type = "convex", - show.clust.cent = FALSE, - ggtheme = theme_bw()) - -plot(coor_2d, col = mycols[grp_hclust]) -rgl::plot3d(coor_3d, col = mycols[grp_hclust]) - -table(dattree[grp_hclust == 1, "Pattern"]) -table(dattree[grp_hclust == 2, "Pattern"]) -table(dattree[grp_hclust == 3, "Pattern"]) -table(dattree[grp_hclust == 4, "Pattern"]) - - -aggregate(. ~ grp_hclust, df, mean) - - -aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, - nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase, - mean) - -#--------------- (2.3) DBSCAN clustering --------------- - -library(dbscan) -d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1) -hullplot(df, d1) - -grp_db <- d1$cluster -table(grp_db) - -kNNdistplot(df, k = ncol(df)) -abline(h = 0.2, col = "red") -abline(h = 0.06, col = "red") - -fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]), - #palette = mycols, - ellipse.type = "convex", - show.clust.cent = FALSE, - ggtheme = theme_bw()) - -mycols <- c("black", mycols) - -plot(coor_2d, col = mycols[grp_db + 1]) -rgl::plot3d(coor_3d, col = mycols[grp_db + 1]) - -aggregate(. ~ grp_db, df, mean) - -table(dattree[grp_db == 0, "Pattern"]) -table(dattree[grp_db == 1, "Pattern"]) -table(dattree[grp_db == 2, "Pattern"]) -table(dattree[grp_db == 3, "Pattern"]) - -# Does not really work with these features! - - -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -# Navigation types by Bousbia et al. (2010): -# - Overviewing: this value is close to the Canter “scanning” value. It -# implies that the learner is covering a large proportion of course pages. -# Through this phase of fast-reading, the user seeks to acquire an -# overall view of the course. -# - Flitting: close to “wandering”. It reflects a browsing activity without a -# strategy or a particular goal. The main difference with the overviewing -# type is the lack of focus on the course. -# - Studying: corresponds to a partial or complete reading of the course -# pages where the learner spends time on each page. -# - Deepening: This describes a learner who spends relatively long time on a -# course, checking details, and seeking Web documents related to the course -# topics. The main difference with studying is the Web search part that the -# learner uses to obtain a deeper understanding of the course. - -# Taxonomy defined by Canter et al. (1985): -# - Scanning: seeking an overview of a theme (i.e. subpart of the hypermedia) -# by requesting an important proportion of its pages but without spending -# much time on them. -# - Browsing: going wherever the data leads the navigator until catching an -# interest. -# - Exploring: reading the viewed pages thoroughly. -# - Searching: seeking for a particular document or information. -# - Wandering: navigating in an unstructured fashion without any particular -# goal or strategy. +# TODO: Move to helper file # Features for navigation types for MTT: # - Scanning / Overviewing: -# * Proportion of artworks looked at is high: datcase$nitems / 70 +# * Proportion of artworks looked at is high # * Duration per artwork is low: "ave_duration_item" / datcase$duration # - Exploring: -# * Looking at additional information for most items touched (high value): -# harmonic mean of datcase$nopenTopic / datcase$nflipCard and -# datcase$nopenPopup / datcase$nopenTopic +# * Looking at additional information is high # - Searching / Studying: -# * Looking only at a few items -# datcase$nitems / 70 is low +# * Proportion of artworks looked at is low # * Opening few cards # datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low # * but for most cards popups are opened: # datcase$nopenPopup / datcase$nflipCard is high # - Wandering / Flitting: -# * Items are mostly just moved: -# datcase$nmove / datcase$length is high +# * Proportion of moves is high # * Duration per case is low: # datcase$duration / mean(datcase$duration) or median(datcase$duration) # * Duration per artwork is low: "ave_duration_item" / datcase$duration dattree <- data.frame(case = datcase$case, - #Duration = datcase$duration, - PropItems = datcase$nitems / length(unique(dat$item)), - #PropTopic = datcase$nopenTopic / datcase$nflipCard, - #PropPopup = datcase$nopenPopup / datcase$nopenTopic, - # SearchInfo = - # 2*(((datcase$nopenPopup / datcase$nopenTopic) * - # (datcase$nopenTopic / datcase$nflipCard)) / - # ((datcase$nopenPopup / datcase$nopenTopic) + - # (datcase$nopenTopic / datcase$nflipCard)) - # ), - SearchInfo = datcase$nopenTopic / datcase$nflipCard + - datcase$nopenPopup / datcase$nopenTopic, + NumItems = datcase$nitems, + NumTopic = datcase$nopenTopic, + NumPopup = datcase$nopenPopup, PropMoves = datcase$nmove / datcase$length, PathLinearity = datcase$nitems / datcase$npaths, Singularity = datcase$npaths / datcase$length ) -#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo) -dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo) - -#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic) -#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup) +dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic) +dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup) get_centrality <- function(case, data) { data$start <- data$date.start data$complete <- data$date.stop - + alog <- activitylog(data[data$case == case, ], case_id = "case", activity_id = "item", @@ -388,17 +220,16 @@ get_centrality <- function(case, data) { igraph::centr_degree(inet, loops = TRUE)$centralization, igraph::centr_betw(inet)$centralization) } +# TODO: Move to helper file +# centrality <- lapply(dattree$case, get_centrality, data = dat) +# centrality <- do.call(rbind, centrality) +# +# save(centrality, file = "results/haum/tmp_centrality.RData") +load("results/haum/tmp_centrality.RData") -centrality <- lapply(dattree$case, get_centrality, data = dat) -centrality <- do.call(rbind, centrality) - -save(centrality, file = "results/haum/tmp_centrality.RData") - -#dattree$centr_degree <- centrality[, 1] -#dattree$centr_degree_loops <- centrality[, 2] -dattree$DegreeCentrality <- centrality[, 2] -#dattree$BetweenCentrality <- centrality[, 3] +#dattree$DegreeCentrality <- centrality[, 2] +dattree$BetweenCentrality <- centrality[, 3] ## Add average duration per item @@ -413,135 +244,118 @@ tmp$duration <- tmp$max_time - tmp$min_time tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration +#dattree$AvDurItem <- dattree$AvDurItem / datcase$duration rm(tmp) +summary(dattree) + plot(dattree[, -1], pch = ".") -par(mfrow = c(3,4)) -hist(dattree$Duration, breaks = 50, main = "") +par(mfrow = c(2,4)) hist(dattree$AvDurItem, breaks = 50, main = "") -hist(dattree$PropItems, breaks = 50, main = "") -hist(dattree$PropTopic, breaks = 50, main = "") -hist(dattree$PropPopup, breaks = 50, main = "") +hist(dattree$NumItems, breaks = 50, main = "") +hist(dattree$NumTopic, breaks = 50, main = "") +hist(dattree$NumPopup, breaks = 50, main = "") hist(dattree$PropMoves, breaks = 50, main = "") hist(dattree$PathLinearity, breaks = 50, main = "") hist(dattree$Singularity, breaks = 50, main = "") -hist(dattree$centr_degree, breaks = 50, main = "") -hist(dattree$centr_degree_loops, breaks = 50, main = "") -hist(dattree$centr_between, breaks = 50, main = "") +hist(dattree$BetweenCentrality, breaks = 50, main = "") + +# Indicator variable if table was used as info terminal only +dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, labels = c("no", "yes")) + +# Add pattern to datcase; loosely based on Bousbia et al. (2009) +dattree$Pattern <- "Dispersion" +dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar", + dattree$Pattern) +dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & + dattree$BetweenCentrality > 0.5, "Star", + dattree$Pattern) +dattree$Pattern <- factor(dattree$Pattern) -cor_mat <- cor(dattree[, -1], use = "pairwise") -diag(cor_mat) <- NA -heatmap(cor_mat) +# Remove cases with extreme outliers +# TODO: Do I want this??? +quantile(datcase$nopenTopic, 0.999) +quantile(datcase$nopenPopup, 0.999) -# dattree$Pattern <- "Dispersion" -# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & -# dattree$Singularity > 0.8, "Scholar", -# dattree$Pattern) -# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & -# dattree$centr_between > 0.5, "Star", -# dattree$Pattern) +dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ] + +plot(dattree[, -1], pch = ".") + +par(mfrow = c(2,4)) +hist(dattree$AvDurItem, breaks = 50, main = "") +hist(dattree$NumItems, breaks = 50, main = "") +hist(dattree$NumTopic, breaks = 50, main = "") +hist(dattree$NumPopup, breaks = 50, main = "") +hist(dattree$PropMoves, breaks = 50, main = "") +hist(dattree$PathLinearity, breaks = 50, main = "") +hist(dattree$Singularity, breaks = 50, main = "") +hist(dattree$BetweenCentrality, breaks = 50, main = "") -write.table(dattree, - file = "results/haum/dattree.csv", - sep = ";", - quote = FALSE, - row.names = FALSE) #--------------- (2) Clustering --------------- -df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")] -#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")] -df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) -df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) -df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) +library(cluster) -# scale Duration and min/max SearchInfo -#df$AvDurItem <- as.numeric(scale(df$AvDurItem)) -df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) / - (max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE)) -#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / -# (max(df$SearchInfo) - min(df$SearchInfo)) -df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) / - (max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE)) -df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) / - (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE)) +df <- dattree[1:10000, -1] # remove case variable +# TODO: Do I need to scale or does normalization also work? + +# Normalize Duration and Numbers +# df$AvDurItem <- normalize(df$AvDurItem) +# df$NumItems <- normalize(df$NumItems) +# df$NumTopic <- normalize(df$NumTopic) +# df$NumPopup <- normalize(df$NumPopup) + +# summary(df) + +# Look at collinearity +cor_mat <- cor(df) +diag(cor_mat) <- NA +heatmap(cor_mat) + +#df <- as.data.frame(scale(dattree[, -1])) -#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -df <- dattree[, -1] -df$AvDurItem <- normalize(df$AvDurItem) -df$SearchInfo <- normalize(df$SearchInfo) -df$InfocardOnly <- datcase$infocardOnly -summary(df) -#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +#--------------- (2.2) Hierarchical clustering --------------- -# "Flatten" with PCA -pc <- prcomp(df) -coor_2d <- as.data.frame(pc$x[, c(1, 2)]) -coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)]) +mat <- daisy(df, metric = "gower") + +# "Flatten" with MDS +coor_2d <- as.data.frame(cmdscale(mat, k = 2)) +coor_3d <- as.data.frame(cmdscale(mat, k = 3)) plot(coor_2d) rgl::plot3d(coor_3d) -#--------------- (2.1) K-Means clustering --------------- +#mat <- dist(df) -mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") +# https://uc-r.github.io/hc_clustering +method <- c(average = "average", single = "single", complete = "complete", + ward = "ward.D2") -k1 <- kmeans(df, 4) +hc_method <- function(x) { + hclust(mat, method = x) +} -grp_km <- k1$cluster -table(grp_km) - -fviz_cluster(list(data = df, cluster = grp_km), - palette = mycols, - ellipse.type = "convex", - show.clust.cent = FALSE, - ggtheme = theme_bw()) - -plot(coor_2d, col = mycols[grp_km]) - -rgl::plot3d(coor_3d, col = mycols[grp_km]) - -aggregate(. ~ grp_km, df, mean) - -#--------------- (2.2) Hierarchical clustering --------------- - -mat <- dist(df) - -h1 <- hclust(mat, method = "average") -h2 <- hclust(mat, method = "complete") -h3 <- hclust(mat, method = "ward.D") -h4 <- hclust(mat, method = "ward.D2") -h5 <- hclust(mat, method = "single") - -# Cophenetic Distances, for each linkage (runs quite some time!) -c1 <- cophenetic(h1) -c2 <- cophenetic(h2) -c3 <- cophenetic(h3) -c4 <- cophenetic(h4) -c5 <- cophenetic(h5) - -# Correlations -cor(mat, c1) -cor(mat, c2) -cor(mat, c3) -cor(mat, c4) -cor(mat, c5) +hcs <- lapply(method, hc_method) +cds <- lapply(hcs, cophenetic) +cors <- sapply(cds, cor, y = mat) # https://en.wikipedia.org/wiki/Cophenetic_correlation # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering -hc <- h4 +hc <- hcs$average # Something like a scree plot (??) plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) k <- 4 +mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") + grp_hclust <- cutree(hc, k = k) table(grp_hclust) @@ -553,17 +367,16 @@ fviz_cluster(list(data = df, cluster = grp_hclust), ggtheme = theme_bw()) plot(coor_2d, col = mycols[grp_hclust]) +legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21) rgl::plot3d(coor_3d, col = mycols[grp_hclust]) -table(dattree[grp_hclust == 1, "Pattern"]) -table(dattree[grp_hclust == 2, "Pattern"]) -table(dattree[grp_hclust == 3, "Pattern"]) -table(dattree[grp_hclust == 4, "Pattern"]) - +table(datcase[grp_hclust == 1, "Pattern"]) +table(datcase[grp_hclust == 2, "Pattern"]) +table(datcase[grp_hclust == 3, "Pattern"]) +table(datcase[grp_hclust == 4, "Pattern"]) aggregate(. ~ grp_hclust, df, mean) - aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase, mean) @@ -571,7 +384,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, #--------------- (2.3) DBSCAN clustering --------------- library(dbscan) -d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1) +d1 <- dbscan(df, eps = 1, minPts = ncol(df) + 1) hullplot(df, d1) grp_db <- d1$cluster @@ -579,10 +392,10 @@ table(grp_db) kNNdistplot(df, k = ncol(df)) abline(h = 0.2, col = "red") -abline(h = 0.06, col = "red") +abline(h = 1, col = "red") fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]), - #palette = mycols, + palette = mycols, ellipse.type = "convex", show.clust.cent = FALSE, ggtheme = theme_bw()) @@ -595,17 +408,20 @@ rgl::plot3d(coor_3d, col = mycols[grp_db + 1]) aggregate(. ~ grp_db, df, mean) -table(dattree[grp_db == 0, "Pattern"]) -table(dattree[grp_db == 1, "Pattern"]) -table(dattree[grp_db == 2, "Pattern"]) -table(dattree[grp_db == 3, "Pattern"]) +table(datcase[grp_db == 0, "Pattern"]) +table(datcase[grp_db == 1, "Pattern"]) +table(datcase[grp_db == 2, "Pattern"]) +table(datcase[grp_db == 3, "Pattern"]) +table(datcase[grp_db == 4, "Pattern"]) ### Look at selected cases ########################################### +dattree[grp_db == 0, ] + tmp <- dat tmp$start <- tmp$date.start tmp$complete <- tmp$date.stop -alog <- activitylog(tmp[tmp$case == 30418, ], +alog <- activitylog(tmp[tmp$case == 15, ], case_id = "case", activity_id = "item", resource_id = "path", @@ -650,38 +466,23 @@ library(partykit) dattree_db <- dattree[grp_db != 0, -1] dattree_db$grp <- factor(grp_db[grp_db != 0]) -dattree_db$Pattern <- factor(dattree_db$Pattern) c1 <- rpart(grp ~ ., data = dattree_db, method = "class") plot(as.party(c1)) -c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class") +c2 <- rpart(as.factor(grp_hclust) ~ ., data = dattree[, -1], method = "class") plot(as.party(c2)) - - -c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves + - Pattern, data = dattree_db, method = "class") -plot(as.party(c1)) - # with conditional tree -c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5) +c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.05) plot(c2) -c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + - PropMoves + Pattern, data = dattree_db, alpha = 0) -plot(c3) - -c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + - PropMoves + Pattern, data = dattree_db, alpha = 1) -plot(c4) - # with excluded points -c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05) +c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0) plot(c5) # with excluded points -c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1) +c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 0) plot(c6) # --> just checking