From 3cf6c4c51d39708ff5db8db65c1c123cab4847aa Mon Sep 17 00:00:00 2001 From: nwickel Date: Fri, 8 Mar 2024 09:22:10 +0100 Subject: [PATCH] End of day commit --- code/09_user-navigation.R | 74 +++++++++++++++------------------------ 1 file changed, 29 insertions(+), 45 deletions(-) diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R index f90410d..b6f5b02 100644 --- a/code/09_user-navigation.R +++ b/code/09_user-navigation.R @@ -166,16 +166,14 @@ normalize <- function(x) { # * Duration per artwork is low: "ave_duration_item" / datcase$duration dattree <- data.frame(case = datcase$case, - NumItems = datcase$nitems, - NumTopic = datcase$nopenTopic, - NumPopup = datcase$nopenPopup, + PropItems = datcase$nitems / length(unique(dat$item)), + SearchInfo = datcase$nopenTopic + datcase$nopenPopup, PropMoves = datcase$nmove / datcase$length, PathLinearity = datcase$nitems / datcase$npaths, Singularity = datcase$npaths / datcase$length ) -dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic) -dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup) +dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic) get_centrality <- function(case, data) { @@ -232,7 +230,12 @@ dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & dattree$BetweenCentrality > 0.5, "Star", dattree$Pattern) dattree$Pattern <- factor(dattree$Pattern) +# TODO: Get rid of PathLinearity and Singularity as features when I am +# using Pattern? +dattree$PathLinearity <- NULL +dattree$Singularity <- NULL +dattree$BetweenCentrality <- NULL summary(dattree) @@ -248,6 +251,7 @@ hist(dattree$PathLinearity, breaks = 50, main = "") hist(dattree$Singularity, breaks = 50, main = "") hist(dattree$BetweenCentrality, breaks = 50, main = "") +#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # Remove cases with extreme outliers # TODO: Do I want this??? @@ -267,58 +271,31 @@ hist(dattree$PropMoves, breaks = 50, main = "") hist(dattree$PathLinearity, breaks = 50, main = "") hist(dattree$Singularity, breaks = 50, main = "") hist(dattree$BetweenCentrality, breaks = 50, main = "") - +#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #--------------- (2) Clustering --------------- -#library(cluster) +df <- dattree[, -1] # remove case variable -#df <- dattree[, -1] # remove case variable -# TODO: Do I need to scale or does normalization also work? +# Normalize Duration and SearchInfo +df$AvDurItem <- normalize(df$AvDurItem) +df$SearchInfo <- normalize(df$SearchInfo) -# Normalize Duration and Numbers -# df$AvDurItem <- normalize(df$AvDurItem) -# df$NumItems <- normalize(df$NumItems) -# df$NumTopic <- normalize(df$NumTopic) -# df$NumPopup <- normalize(df$NumPopup) - -# summary(df) +summary(df) # Look at collinearity -# cor_mat <- cor(df) -# diag(cor_mat) <- NA -# heatmap(cor_mat) - -#df <- as.data.frame(scale(dattree[, -1])) - +cor_mat <- cor(df) +diag(cor_mat) <- NA +heatmap(cor_mat) #--------------- (2.2) Hierarchical clustering --------------- -dist_mat <- cluster::daisy(dattree[, -1], metric = "gower") - -# # "Flatten" with PCA -# mm <- model.matrix( ~ ., df)[, -1] # remove intercept -# tmp <- as.data.frame(lapply(as.data.frame(mm), normalize)) -# pc <- prcomp(mm) -# coor_2d <- as.data.frame(pc$x[, 1:2]) -# coor_3d <- as.data.frame(pc$x[, 1:3]) +dist_mat <- cluster::daisy(df, metric = "gower") # "Flatten" with MDS coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) - -# Idea from -# https://stats.stackexchange.com/questions/264912/mds-on-large-dataset-r-or-python -# https://www.inf.uni-konstanz.de/exalgo/software/mdsj/ -write.table(as.matrix(dist_mat), file = "mds/dist_mat.txt", row.names = FALSE, - col.names = FALSE) -# Run java script -system("java -jar mdsj.jar -d2 mds/dist_mat.txt mds/mds_coor_2d.txt") -system("java -jar mdsj.jar -d3 mds/dist_mat.txt mds/mds_coor_3d.txt") - -coor_2d_java <- read.table("mds/mds_coor_2d.txt", header = FALSE, sep = " ") -plot(coor_2d_java) - +# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances? plot(coor_2d) rgl::plot3d(coor_3d) @@ -326,8 +303,10 @@ rgl::plot3d(coor_3d) method <- c(average = "average", single = "single", complete = "complete", ward = "ward") +method <- "ward" + hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) -acs <- pbapply::sapply(hcs, function(x) x$ac) +acs <- pbapply::pbsapply(hcs, function(x) x$ac) hc <- hcs$ward @@ -351,6 +330,11 @@ table(dattree[cluster == 2, "Pattern"]) table(dattree[cluster == 3, "Pattern"]) table(dattree[cluster == 4, "Pattern"]) +table(dattree[cluster == 1, "InfocardOnly"]) +table(dattree[cluster == 2, "InfocardOnly"]) +table(dattree[cluster == 3, "InfocardOnly"]) +table(dattree[cluster == 4, "InfocardOnly"]) + aggregate(. ~ cluster, df, mean) aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, @@ -397,7 +381,7 @@ write.table(res, quote = FALSE, row.names = FALSE) -save(res, dist_mat, hcs, acs, datcase, dattree, +save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, file = "results/haum/tmp_user-navigation.RData") #--------------- (3) Fit tree ---------------