End of day commit

2024-03-08 09:22:10 +01:00 · 2024-03-08 09:22:10 +01:00 · 3cf6c4c51d
commit 3cf6c4c51d
parent 4eca6c81d6
1 changed files with 29 additions and 45 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -166,16 +166,14 @@ normalize <- function(x) {
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration

 dattree <- data.frame(case = datcase$case,
-                      NumItems = datcase$nitems,
-                      NumTopic = datcase$nopenTopic,
-                      NumPopup = datcase$nopenPopup,
+                      PropItems = datcase$nitems / length(unique(dat$item)),
+                      SearchInfo = datcase$nopenTopic + datcase$nopenPopup,
                      PropMoves = datcase$nmove / datcase$length,
                      PathLinearity = datcase$nitems / datcase$npaths,
                      Singularity = datcase$npaths / datcase$length
 )

-dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
-dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
+dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)

 get_centrality <- function(case, data) {

@ -232,7 +230,12 @@ dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
                          dattree$BetweenCentrality > 0.5, "Star",
                          dattree$Pattern)
 dattree$Pattern <- factor(dattree$Pattern)
+# TODO: Get rid of PathLinearity and Singularity as features when I am
+# using Pattern?

+dattree$PathLinearity <- NULL
+dattree$Singularity <- NULL
+dattree$BetweenCentrality <- NULL

 summary(dattree)

@ -248,6 +251,7 @@ hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
 hist(dattree$BetweenCentrality, breaks = 50, main = "")

+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Remove cases with extreme outliers
 # TODO: Do I want this???

@ -267,58 +271,31 @@ hist(dattree$PropMoves, breaks = 50, main = "")
 hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
 hist(dattree$BetweenCentrality, breaks = 50, main = "")
-
+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 #--------------- (2) Clustering ---------------

-#library(cluster)
+df <- dattree[, -1] # remove case variable

-#df <- dattree[, -1] # remove case variable
-# TODO: Do I need to scale or does normalization also work?
+# Normalize Duration and SearchInfo
+df$AvDurItem <- normalize(df$AvDurItem)
+df$SearchInfo  <- normalize(df$SearchInfo)

-# Normalize Duration and Numbers
-# df$AvDurItem <- normalize(df$AvDurItem)
-# df$NumItems  <- normalize(df$NumItems)
-# df$NumTopic  <- normalize(df$NumTopic)
-# df$NumPopup  <- normalize(df$NumPopup)
-
-# summary(df)
+summary(df)

 # Look at collinearity
-# cor_mat <- cor(df)
-# diag(cor_mat) <- NA
-# heatmap(cor_mat)
-
-#df <- as.data.frame(scale(dattree[, -1]))
-
+cor_mat <- cor(df)
+diag(cor_mat) <- NA
+heatmap(cor_mat)

 #--------------- (2.2) Hierarchical clustering ---------------

-dist_mat <- cluster::daisy(dattree[, -1], metric = "gower")
-
-# # "Flatten" with PCA
-# mm <- model.matrix( ~ ., df)[, -1]   # remove intercept
-# tmp <- as.data.frame(lapply(as.data.frame(mm), normalize))
-# pc <- prcomp(mm)
-# coor_2d <- as.data.frame(pc$x[, 1:2])
-# coor_3d <- as.data.frame(pc$x[, 1:3])
+dist_mat <- cluster::daisy(df, metric = "gower")

 # "Flatten" with MDS
 coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2))
 coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
-
-# Idea from
-# https://stats.stackexchange.com/questions/264912/mds-on-large-dataset-r-or-python
-# https://www.inf.uni-konstanz.de/exalgo/software/mdsj/
-write.table(as.matrix(dist_mat), file = "mds/dist_mat.txt", row.names = FALSE,
-            col.names = FALSE)
-# Run java script
-system("java -jar mdsj.jar -d2 mds/dist_mat.txt mds/mds_coor_2d.txt")
-system("java -jar mdsj.jar -d3 mds/dist_mat.txt mds/mds_coor_3d.txt")
-
-coor_2d_java <- read.table("mds/mds_coor_2d.txt", header = FALSE, sep = " ")
-plot(coor_2d_java)
-
+# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances?

 plot(coor_2d)
 rgl::plot3d(coor_3d)
@ -326,8 +303,10 @@ rgl::plot3d(coor_3d)
 method <- c(average = "average", single = "single", complete = "complete",
            ward = "ward")

+method <- "ward"
+
 hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
-acs <- pbapply::sapply(hcs, function(x) x$ac)
+acs <- pbapply::pbsapply(hcs, function(x) x$ac)

 hc <- hcs$ward

@ -351,6 +330,11 @@ table(dattree[cluster == 2, "Pattern"])
 table(dattree[cluster == 3, "Pattern"])
 table(dattree[cluster == 4, "Pattern"])

+table(dattree[cluster == 1, "InfocardOnly"])
+table(dattree[cluster == 2, "InfocardOnly"])
+table(dattree[cluster == 3, "InfocardOnly"])
+table(dattree[cluster == 4, "InfocardOnly"])
+
 aggregate(. ~ cluster, df, mean)

 aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
@ -397,7 +381,7 @@ write.table(res,
            quote = FALSE,
            row.names = FALSE)

-save(res, dist_mat, hcs, acs, datcase, dattree, 
+save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d,
     file = "results/haum/tmp_user-navigation.RData")

 #--------------- (3) Fit tree ---------------