More work on trace clustering

2024-03-04 17:32:50 +01:00 · 2024-03-04 17:32:50 +01:00 · ea0660817a
commit ea0660817a
parent 72d2b6b799
1 changed files with 221 additions and 27 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -126,11 +126,177 @@ datcase$duration <- datcase$max_time - datcase$min_time
 datcase$min_time <- NULL
 datcase$max_time <- NULL

+
+check_infocards <- function(subdata, artworks) {
+  infocard_only <- NULL
+  if(any(unique(subdata$item) %in% artworks)) {
+    infocard_only <- FALSE
+  } else {
+    infocard_only <- TRUE
+  }
+  as.numeric(infocard_only)
+}
+
+artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
+
+datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks)
+
+
+# Clean up NAs
+datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance)
+datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize)
+datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree)
+datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist)
+datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details)
+datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info)
+datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition)
+datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks)
+datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe)
+datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik)
+datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema)
+datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics)
+datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards)
+
+
+
 cor_mat <- cor(datcase[, -1], use = "pairwise")
 diag(cor_mat) <- NA
 heatmap(cor_mat)

-# TODO: Add info if all items of a case are information cards??
+
+normalize <- function(x) {
+  (x - min(x)) / (max(x) - min(x))
+}
+
+
+#df <- as.data.frame(lapply(datcase[, -1], normalize))
+df <- as.data.frame(lapply(datcase[, -1], scale))
+#df <- datcase[, -1]
+
+# "Flatten" with PCA
+pc <- prcomp(df)
+coor_2d <- as.data.frame(pc$x[, c(1, 2)])
+coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
+
+plot(coor_2d)
+rgl::plot3d(coor_3d)
+
+#--------------- (2.1) K-Means clustering ---------------
+
+mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
+
+k1 <- kmeans(df, 4)
+
+grp_km <- k1$cluster
+table(grp_km)
+
+fviz_cluster(list(data = df, cluster = grp_km),
+             palette = mycols,
+             ellipse.type = "convex",
+             show.clust.cent = FALSE,
+             ggtheme = theme_bw())
+
+plot(coor_2d, col = mycols[grp_km])
+
+rgl::plot3d(coor_3d, col = mycols[grp_km])
+
+aggregate(. ~ grp_km, df, mean)
+
+#--------------- (2.2) Hierarchical clustering ---------------
+
+mat <- dist(df)
+
+h1 <- hclust(mat, method = "average")
+h2 <- hclust(mat, method = "complete")
+h3 <- hclust(mat, method = "ward.D")
+h4 <- hclust(mat, method = "ward.D2")
+h5 <- hclust(mat, method = "single")
+
+# Cophenetic Distances, for each linkage (runs quite some time!)
+c1 <- cophenetic(h1)
+c2 <- cophenetic(h2)
+c3 <- cophenetic(h3)
+c4 <- cophenetic(h4)
+c5 <- cophenetic(h5)
+
+# Correlations
+cor(mat, c1)
+cor(mat, c2)
+cor(mat, c3)
+cor(mat, c4)
+cor(mat, c5)
+
+# https://en.wikipedia.org/wiki/Cophenetic_correlation
+# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
+
+hc <- h4
+
+# Something like a scree plot (??)
+plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
+
+k <- 4
+
+grp_hclust <- cutree(hc, k = k)
+
+table(grp_hclust)
+
+fviz_cluster(list(data = df, cluster = grp_hclust),
+             palette = mycols,
+             ellipse.type = "convex",
+             show.clust.cent = FALSE,
+             ggtheme = theme_bw())
+
+plot(coor_2d, col = mycols[grp_hclust])
+rgl::plot3d(coor_3d, col = mycols[grp_hclust])
+
+table(dattree[grp_hclust == 1, "Pattern"])
+table(dattree[grp_hclust == 2, "Pattern"])
+table(dattree[grp_hclust == 3, "Pattern"])
+table(dattree[grp_hclust == 4, "Pattern"])
+
+
+aggregate(. ~ grp_hclust, df, mean)
+
+
+aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
+                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
+          mean)
+
+#--------------- (2.3) DBSCAN clustering ---------------
+
+library(dbscan)
+d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
+hullplot(df, d1)
+
+grp_db <- d1$cluster
+table(grp_db)
+
+kNNdistplot(df, k = ncol(df))
+abline(h = 0.2, col = "red")
+abline(h = 0.06, col = "red")
+
+fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
+             #palette = mycols,
+             ellipse.type = "convex",
+             show.clust.cent = FALSE,
+             ggtheme = theme_bw())
+
+mycols <- c("black", mycols)
+
+plot(coor_2d, col = mycols[grp_db + 1])
+rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
+
+aggregate(. ~ grp_db, df, mean)
+
+table(dattree[grp_db == 0, "Pattern"])
+table(dattree[grp_db == 1, "Pattern"])
+table(dattree[grp_db == 2, "Pattern"])
+table(dattree[grp_db == 3, "Pattern"])
+
+# Does not really work with these features!
+
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 # Navigation types by Bousbia et al. (2010):
 # - Overviewing: this value is close to the Canter “scanning” value. It
@ -181,26 +347,28 @@ heatmap(cor_mat)
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration

 dattree <- data.frame(case = datcase$case,
-                      Duration = datcase$duration,
+                      #Duration = datcase$duration,
                      PropItems = datcase$nitems / length(unique(dat$item)),
-                      PropTopic = datcase$nopenTopic / datcase$nflipCard,
-                      PropPopup = datcase$nopenPopup / datcase$nopenTopic,
+                      #PropTopic = datcase$nopenTopic / datcase$nflipCard,
+                      #PropPopup = datcase$nopenPopup / datcase$nopenTopic,
                      # SearchInfo = 
                      #   2*(((datcase$nopenPopup / datcase$nopenTopic) *
                      #       (datcase$nopenTopic / datcase$nflipCard)) /
                      # ((datcase$nopenPopup / datcase$nopenTopic) +
                      #  (datcase$nopenTopic / datcase$nflipCard))
                      # ),
+                      SearchInfo = datcase$nopenTopic / datcase$nflipCard +
+                        datcase$nopenPopup / datcase$nopenTopic,
                      PropMoves = datcase$nmove / datcase$length,
                      PathLinearity = datcase$nitems / datcase$npaths,
                      Singularity = datcase$npaths / datcase$length
 )

 #dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
-#dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
+dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)

-dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
-dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
+#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
+#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)

 get_centrality <- function(case, data) {

@ -227,9 +395,10 @@ centrality <- do.call(rbind, centrality)

 save(centrality, file = "results/haum/tmp_centrality.RData")

-dattree$centr_degree <- centrality[, 1]
-dattree$centr_degree_loops <- centrality[, 2]
-dattree$centr_between <- centrality[, 3]
+#dattree$centr_degree <- centrality[, 1]
+#dattree$centr_degree_loops <- centrality[, 2]
+dattree$DegreeCentrality <- centrality[, 2]
+#dattree$BetweenCentrality <- centrality[, 3]

 ## Add average duration per item

@ -247,7 +416,7 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration

 rm(tmp)

-plot(dattree)
+plot(dattree[, -1], pch = ".")

 par(mfrow = c(3,4))
 hist(dattree$Duration, breaks = 50, main = "")
@ -268,13 +437,13 @@ diag(cor_mat) <- NA
 heatmap(cor_mat)


-dattree$Pattern <- "Dispersion"
-dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
-                          dattree$Singularity > 0.8, "Scholar",
-                          dattree$Pattern)
-dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
-                          dattree$centr_between > 0.5, "Star",
-                          dattree$Pattern)
+# dattree$Pattern <- "Dispersion"
+# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
+#                           dattree$Singularity > 0.8, "Scholar",
+#                           dattree$Pattern)
+# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
+#                           dattree$centr_between > 0.5, "Star",
+#                           dattree$Pattern)

 write.table(dattree,
            file = "results/haum/dattree.csv",
@ -302,11 +471,22 @@ df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
  (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))


+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+df <- dattree[, -1]
+df$AvDurItem <- normalize(df$AvDurItem)
+df$SearchInfo <- normalize(df$SearchInfo)
+df$InfocardOnly <- datcase$infocardOnly
+summary(df)
+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
 # "Flatten" with PCA
 pc <- prcomp(df)
 coor_2d <- as.data.frame(pc$x[, c(1, 2)])
 coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])

+plot(coor_2d)
+rgl::plot3d(coor_3d)
+
 #--------------- (2.1) K-Means clustering ---------------

 mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
@ -326,10 +506,11 @@ plot(coor_2d, col = mycols[grp_km])

 rgl::plot3d(coor_3d, col = mycols[grp_km])

+aggregate(. ~ grp_km, df, mean)
+
 #--------------- (2.2) Hierarchical clustering ---------------

 mat <- dist(df)
-# TODO: Do I need to scale all variables?

 h1 <- hclust(mat, method = "average")
 h2 <- hclust(mat, method = "complete")
@ -354,7 +535,7 @@ cor(mat, c5)
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering

-hc <- h1
+hc <- h4

 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@ -390,7 +571,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
 #--------------- (2.3) DBSCAN clustering ---------------

 library(dbscan)
-d1 <- dbscan(df, eps = .2, minPts = 9)
+d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
 hullplot(df, d1)

 grp_db <- d1$cluster
@ -398,13 +579,18 @@ table(grp_db)

 kNNdistplot(df, k = ncol(df))
 abline(h = 0.2, col = "red")
+abline(h = 0.06, col = "red")

 fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
-             palette = mycols,
+             #palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())

+mycols <- c("black", mycols)
+
+plot(coor_2d, col = mycols[grp_db + 1])
+legend("topleft", paste("Cl", 0:4), col = mycols, pch = 21)
 rgl::plot3d(coor_3d, col = mycols[grp_db + 1])

 aggregate(. ~ grp_db, df, mean)
@ -462,17 +648,24 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df,
 library(rpart)
 library(partykit)

-dattree_db <- dattree[grp_db != 0, ]
+dattree_db <- dattree[grp_db != 0, -1]
 dattree_db$grp <- factor(grp_db[grp_db != 0])
 dattree_db$Pattern <- factor(dattree_db$Pattern)

+c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
+plot(as.party(c1))
+
+c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
+plot(as.party(c2))
+
+
+
 c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
              Pattern, data = dattree_db, method = "class")
 plot(as.party(c1))

 # with conditional tree
-c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
-            PropMoves + Pattern, data = dattree_db, alpha = 0.5)
+c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
 plot(c2)

 c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
@ -484,8 +677,7 @@ c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
 plot(c4)

 # with excluded points
-c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
-            PropMoves + factor(Pattern), data = dattree, alpha = 1)
+c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
 plot(c5)

 # with excluded points
@ -603,3 +795,5 @@ igraph::centr_degree(inet, loops = FALSE)
 igraph::centr_betw(inet)
 igraph::centr_clo(inet)

+
+