More work on trace clustering

2024-03-04 17:32:50 +01:00 · 2024-03-04 17:32:50 +01:00 · ea0660817a
commit ea0660817a
parent 72d2b6b799
1 changed files with 221 additions and 27 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -126,11 +126,177 @@ datcase$duration <- datcase$max_time - datcase$min_time
 datcase$min_time <- NULL
 datcase$max_time <- NULL
 check_infocards <- function(subdata, artworks) {
  infocard_only <- NULL
  if(any(unique(subdata$item) %in% artworks)) {
    infocard_only <- FALSE
  } else {
    infocard_only <- TRUE
  }
  as.numeric(infocard_only)
 }
 artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
 datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks)
 # Clean up NAs
 datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance)
 datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize)
 datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree)
 datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist)
 datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details)
 datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info)
 datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition)
 datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks)
 datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe)
 datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik)
 datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema)
 datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics)
 datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards)
 cor_mat <- cor(datcase[, -1], use = "pairwise")
 diag(cor_mat) <- NA
 heatmap(cor_mat)
-# TODO: Add info if all items of a case are information cards??
+
 normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
 }
 #df <- as.data.frame(lapply(datcase[, -1], normalize))
 df <- as.data.frame(lapply(datcase[, -1], scale))
 #df <- datcase[, -1]
 # "Flatten" with PCA
 pc <- prcomp(df)
 coor_2d <- as.data.frame(pc$x[, c(1, 2)])
 coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
 plot(coor_2d)
 rgl::plot3d(coor_3d)
 #--------------- (2.1) K-Means clustering ---------------
 mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
 k1 <- kmeans(df, 4)
 grp_km <- k1$cluster
 table(grp_km)
 fviz_cluster(list(data = df, cluster = grp_km),
             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 plot(coor_2d, col = mycols[grp_km])
 rgl::plot3d(coor_3d, col = mycols[grp_km])
 aggregate(. ~ grp_km, df, mean)
 #--------------- (2.2) Hierarchical clustering ---------------
 mat <- dist(df)
 h1 <- hclust(mat, method = "average")
 h2 <- hclust(mat, method = "complete")
 h3 <- hclust(mat, method = "ward.D")
 h4 <- hclust(mat, method = "ward.D2")
 h5 <- hclust(mat, method = "single")
 # Cophenetic Distances, for each linkage (runs quite some time!)
 c1 <- cophenetic(h1)
 c2 <- cophenetic(h2)
 c3 <- cophenetic(h3)
 c4 <- cophenetic(h4)
 c5 <- cophenetic(h5)
 # Correlations
 cor(mat, c1)
 cor(mat, c2)
 cor(mat, c3)
 cor(mat, c4)
 cor(mat, c5)
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
 hc <- h4
 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
 k <- 4
 grp_hclust <- cutree(hc, k = k)
 table(grp_hclust)
 fviz_cluster(list(data = df, cluster = grp_hclust),
             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 plot(coor_2d, col = mycols[grp_hclust])
 rgl::plot3d(coor_3d, col = mycols[grp_hclust])
 table(dattree[grp_hclust == 1, "Pattern"])
 table(dattree[grp_hclust == 2, "Pattern"])
 table(dattree[grp_hclust == 3, "Pattern"])
 table(dattree[grp_hclust == 4, "Pattern"])
 aggregate(. ~ grp_hclust, df, mean)
 aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
          mean)
 #--------------- (2.3) DBSCAN clustering ---------------
 library(dbscan)
 d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
 hullplot(df, d1)
 grp_db <- d1$cluster
 table(grp_db)
 kNNdistplot(df, k = ncol(df))
 abline(h = 0.2, col = "red")
 abline(h = 0.06, col = "red")
 fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
             #palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 mycols <- c("black", mycols)
 plot(coor_2d, col = mycols[grp_db + 1])
 rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
 aggregate(. ~ grp_db, df, mean)
 table(dattree[grp_db == 0, "Pattern"])
 table(dattree[grp_db == 1, "Pattern"])
 table(dattree[grp_db == 2, "Pattern"])
 table(dattree[grp_db == 3, "Pattern"])
 # Does not really work with these features!
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Navigation types by Bousbia et al. (2010):
 # - Overviewing: this value is close to the Canter “scanning” value. It
@ -181,26 +347,28 @@ heatmap(cor_mat)
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration
 dattree <- data.frame(case = datcase$case,
-                      Duration = datcase$duration,
+                      #Duration = datcase$duration,
                      PropItems = datcase$nitems / length(unique(dat$item)),
-                      PropTopic = datcase$nopenTopic / datcase$nflipCard,
+                      #PropTopic = datcase$nopenTopic / datcase$nflipCard,
-                      PropPopup = datcase$nopenPopup / datcase$nopenTopic,
+                      #PropPopup = datcase$nopenPopup / datcase$nopenTopic,
                      # SearchInfo = 
                      #   2*(((datcase$nopenPopup / datcase$nopenTopic) *
                      #       (datcase$nopenTopic / datcase$nflipCard)) /
                      # ((datcase$nopenPopup / datcase$nopenTopic) +
                      #  (datcase$nopenTopic / datcase$nflipCard))
                      # ),
                      SearchInfo = datcase$nopenTopic / datcase$nflipCard +
                        datcase$nopenPopup / datcase$nopenTopic,
                      PropMoves = datcase$nmove / datcase$length,
                      PathLinearity = datcase$nitems / datcase$npaths,
                      Singularity = datcase$npaths / datcase$length
 )
 #dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
-#dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
+dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
-dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
+#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
-dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
+#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
 get_centrality <- function(case, data) {
@ -227,9 +395,10 @@ centrality <- do.call(rbind, centrality)
 save(centrality, file = "results/haum/tmp_centrality.RData")
-dattree$centr_degree <- centrality[, 1]
+#dattree$centr_degree <- centrality[, 1]
-dattree$centr_degree_loops <- centrality[, 2]
+#dattree$centr_degree_loops <- centrality[, 2]
-dattree$centr_between <- centrality[, 3]
+dattree$DegreeCentrality <- centrality[, 2]
 #dattree$BetweenCentrality <- centrality[, 3]
 ## Add average duration per item
@ -247,7 +416,7 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
 rm(tmp)
-plot(dattree)
+plot(dattree[, -1], pch = ".")
 par(mfrow = c(3,4))
 hist(dattree$Duration, breaks = 50, main = "")
@ -268,13 +437,13 @@ diag(cor_mat) <- NA
 heatmap(cor_mat)
-dattree$Pattern <- "Dispersion"
+# dattree$Pattern <- "Dispersion"
-dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
+# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
-                          dattree$Singularity > 0.8, "Scholar",
+#                           dattree$Singularity > 0.8, "Scholar",
-                          dattree$Pattern)
+#                           dattree$Pattern)
-dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
+# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
-                          dattree$centr_between > 0.5, "Star",
+#                           dattree$centr_between > 0.5, "Star",
-                          dattree$Pattern)
+#                           dattree$Pattern)
 write.table(dattree,
            file = "results/haum/dattree.csv",
@ -302,11 +471,22 @@ df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
  (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
 #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 df <- dattree[, -1]
 df$AvDurItem <- normalize(df$AvDurItem)
 df$SearchInfo <- normalize(df$SearchInfo)
 df$InfocardOnly <- datcase$infocardOnly
 summary(df)
 #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # "Flatten" with PCA
 pc <- prcomp(df)
 coor_2d <- as.data.frame(pc$x[, c(1, 2)])
 coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
 plot(coor_2d)
 rgl::plot3d(coor_3d)
 #--------------- (2.1) K-Means clustering ---------------
 mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
@ -326,10 +506,11 @@ plot(coor_2d, col = mycols[grp_km])
 rgl::plot3d(coor_3d, col = mycols[grp_km])
 aggregate(. ~ grp_km, df, mean)
 #--------------- (2.2) Hierarchical clustering ---------------
 mat <- dist(df)
 # TODO: Do I need to scale all variables?
 h1 <- hclust(mat, method = "average")
 h2 <- hclust(mat, method = "complete")
@ -354,7 +535,7 @@ cor(mat, c5)
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
-hc <- h1
+hc <- h4
 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@ -390,7 +571,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
 #--------------- (2.3) DBSCAN clustering ---------------
 library(dbscan)
-d1 <- dbscan(df, eps = .2, minPts = 9)
+d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
 hullplot(df, d1)
 grp_db <- d1$cluster
@ -398,13 +579,18 @@ table(grp_db)
 kNNdistplot(df, k = ncol(df))
 abline(h = 0.2, col = "red")
 abline(h = 0.06, col = "red")
 fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
-             palette = mycols,
+             #palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 mycols <- c("black", mycols)
 plot(coor_2d, col = mycols[grp_db + 1])
 legend("topleft", paste("Cl", 0:4), col = mycols, pch = 21)
 rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
 aggregate(. ~ grp_db, df, mean)
@ -462,17 +648,24 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df,
 library(rpart)
 library(partykit)
-dattree_db <- dattree[grp_db != 0, ]
+dattree_db <- dattree[grp_db != 0, -1]
 dattree_db$grp <- factor(grp_db[grp_db != 0])
 dattree_db$Pattern <- factor(dattree_db$Pattern)
 c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
 plot(as.party(c1))
 c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
 plot(as.party(c2))
 c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
              Pattern, data = dattree_db, method = "class")
 plot(as.party(c1))
 # with conditional tree
-c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
+c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
            PropMoves + Pattern, data = dattree_db, alpha = 0.5)
 plot(c2)
 c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
@ -484,8 +677,7 @@ c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
 plot(c4)
 # with excluded points
-c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
+c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
            PropMoves + factor(Pattern), data = dattree, alpha = 1)
 plot(c5)
 # with excluded points
@ -603,3 +795,5 @@ igraph::centr_degree(inet, loops = FALSE)
 igraph::centr_betw(inet)
 igraph::centr_clo(inet)