End of day commit

2024-03-08 09:22:10 +01:00 · 2024-03-08 09:22:10 +01:00 · 3cf6c4c51d
commit 3cf6c4c51d
parent 4eca6c81d6
1 changed files with 29 additions and 45 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -166,16 +166,14 @@ normalize <- function(x) {
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration
 dattree <- data.frame(case = datcase$case,
-                      NumItems = datcase$nitems,
+                      PropItems = datcase$nitems / length(unique(dat$item)),
-                      NumTopic = datcase$nopenTopic,
+                      SearchInfo = datcase$nopenTopic + datcase$nopenPopup,
                      NumPopup = datcase$nopenPopup,
                      PropMoves = datcase$nmove / datcase$length,
                      PathLinearity = datcase$nitems / datcase$npaths,
                      Singularity = datcase$npaths / datcase$length
 )
-dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
+dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
 dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
 get_centrality <- function(case, data) {
@ -232,7 +230,12 @@ dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
                          dattree$BetweenCentrality > 0.5, "Star",
                          dattree$Pattern)
 dattree$Pattern <- factor(dattree$Pattern)
 # TODO: Get rid of PathLinearity and Singularity as features when I am
 # using Pattern?
 dattree$PathLinearity <- NULL
 dattree$Singularity <- NULL
 dattree$BetweenCentrality <- NULL
 summary(dattree)
@ -248,6 +251,7 @@ hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
 hist(dattree$BetweenCentrality, breaks = 50, main = "")
 #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Remove cases with extreme outliers
 # TODO: Do I want this???
@ -267,58 +271,31 @@ hist(dattree$PropMoves, breaks = 50, main = "")
 hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
 hist(dattree$BetweenCentrality, breaks = 50, main = "")
-
+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 #--------------- (2) Clustering ---------------
-#library(cluster)
+df <- dattree[, -1] # remove case variable
-#df <- dattree[, -1] # remove case variable
+# Normalize Duration and SearchInfo
-# TODO: Do I need to scale or does normalization also work?
+df$AvDurItem <- normalize(df$AvDurItem)
 df$SearchInfo  <- normalize(df$SearchInfo)
-# Normalize Duration and Numbers
+summary(df)
 # df$AvDurItem <- normalize(df$AvDurItem)
 # df$NumItems  <- normalize(df$NumItems)
 # df$NumTopic  <- normalize(df$NumTopic)
 # df$NumPopup  <- normalize(df$NumPopup)
 # summary(df)
 # Look at collinearity
-# cor_mat <- cor(df)
+cor_mat <- cor(df)
-# diag(cor_mat) <- NA
+diag(cor_mat) <- NA
-# heatmap(cor_mat)
+heatmap(cor_mat)
 #df <- as.data.frame(scale(dattree[, -1]))
 #--------------- (2.2) Hierarchical clustering ---------------
-dist_mat <- cluster::daisy(dattree[, -1], metric = "gower")
+dist_mat <- cluster::daisy(df, metric = "gower")
 # # "Flatten" with PCA
 # mm <- model.matrix( ~ ., df)[, -1]   # remove intercept
 # tmp <- as.data.frame(lapply(as.data.frame(mm), normalize))
 # pc <- prcomp(mm)
 # coor_2d <- as.data.frame(pc$x[, 1:2])
 # coor_3d <- as.data.frame(pc$x[, 1:3])
 # "Flatten" with MDS
 coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2))
 coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
-
+# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances?
 # Idea from
 # https://stats.stackexchange.com/questions/264912/mds-on-large-dataset-r-or-python
 # https://www.inf.uni-konstanz.de/exalgo/software/mdsj/
 write.table(as.matrix(dist_mat), file = "mds/dist_mat.txt", row.names = FALSE,
            col.names = FALSE)
 # Run java script
 system("java -jar mdsj.jar -d2 mds/dist_mat.txt mds/mds_coor_2d.txt")
 system("java -jar mdsj.jar -d3 mds/dist_mat.txt mds/mds_coor_3d.txt")
 coor_2d_java <- read.table("mds/mds_coor_2d.txt", header = FALSE, sep = " ")
 plot(coor_2d_java)
 plot(coor_2d)
 rgl::plot3d(coor_3d)
@ -326,8 +303,10 @@ rgl::plot3d(coor_3d)
 method <- c(average = "average", single = "single", complete = "complete",
            ward = "ward")
 method <- "ward"
 hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
-acs <- pbapply::sapply(hcs, function(x) x$ac)
+acs <- pbapply::pbsapply(hcs, function(x) x$ac)
 hc <- hcs$ward
@ -351,6 +330,11 @@ table(dattree[cluster == 2, "Pattern"])
 table(dattree[cluster == 3, "Pattern"])
 table(dattree[cluster == 4, "Pattern"])
 table(dattree[cluster == 1, "InfocardOnly"])
 table(dattree[cluster == 2, "InfocardOnly"])
 table(dattree[cluster == 3, "InfocardOnly"])
 table(dattree[cluster == 4, "InfocardOnly"])
 aggregate(. ~ cluster, df, mean)
 aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
@ -397,7 +381,7 @@ write.table(res,
            quote = FALSE,
            row.names = FALSE)
-save(res, dist_mat, hcs, acs, datcase, dattree, 
+save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d,
     file = "results/haum/tmp_user-navigation.RData")
 #--------------- (3) Fit tree ---------------