Played around with clustering; switched to glower distances and removed kmeans clustering

2024-03-05 16:25:49 +01:00
parent ea0660817a
commit f8c1767074
1 changed files with 117 additions and 316 deletions
@@ -78,6 +78,8 @@ datcase <- datcase |>
  merge(eventtab, by = "case", all = TRUE) |>
  merge(topictab, by = "case", all = TRUE)

+rm(eventtab, topictab)
+
 datcase$ntopiccards <- aggregate(topic ~ case, dat,
                             function(x) ifelse(all(is.na(x)), NA,
                             length(na.omit(x))), na.action =
@@ -136,6 +138,7 @@ check_infocards <- function(subdata, artworks) {
  }
  as.numeric(infocard_only)
 }
+# TODO: Move to helper file

 artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]

@@ -167,208 +170,37 @@ heatmap(cor_mat)
 normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
 }
-
-
-#df <- as.data.frame(lapply(datcase[, -1], normalize))
-df <- as.data.frame(lapply(datcase[, -1], scale))
-#df <- datcase[, -1]
-
-# "Flatten" with PCA
-pc <- prcomp(df)
-coor_2d <- as.data.frame(pc$x[, c(1, 2)])
-coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
-
-plot(coor_2d)
-rgl::plot3d(coor_3d)
-
-#--------------- (2.1) K-Means clustering ---------------
-
-mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
-
-k1 <- kmeans(df, 4)
-
-grp_km <- k1$cluster
-table(grp_km)
-
-fviz_cluster(list(data = df, cluster = grp_km),
-             palette = mycols,
-             ellipse.type = "convex",
-             show.clust.cent = FALSE,
-             ggtheme = theme_bw())
-
-plot(coor_2d, col = mycols[grp_km])
-
-rgl::plot3d(coor_3d, col = mycols[grp_km])
-
-aggregate(. ~ grp_km, df, mean)
-
-#--------------- (2.2) Hierarchical clustering ---------------
-
-mat <- dist(df)
-
-h1 <- hclust(mat, method = "average")
-h2 <- hclust(mat, method = "complete")
-h3 <- hclust(mat, method = "ward.D")
-h4 <- hclust(mat, method = "ward.D2")
-h5 <- hclust(mat, method = "single")
-
-# Cophenetic Distances, for each linkage (runs quite some time!)
-c1 <- cophenetic(h1)
-c2 <- cophenetic(h2)
-c3 <- cophenetic(h3)
-c4 <- cophenetic(h4)
-c5 <- cophenetic(h5)
-
-# Correlations
-cor(mat, c1)
-cor(mat, c2)
-cor(mat, c3)
-cor(mat, c4)
-cor(mat, c5)
-
-# https://en.wikipedia.org/wiki/Cophenetic_correlation
-# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
-
-hc <- h4
-
-# Something like a scree plot (??)
-plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
-
-k <- 4
-
-grp_hclust <- cutree(hc, k = k)
-
-table(grp_hclust)
-
-fviz_cluster(list(data = df, cluster = grp_hclust),
-             palette = mycols,
-             ellipse.type = "convex",
-             show.clust.cent = FALSE,
-             ggtheme = theme_bw())
-
-plot(coor_2d, col = mycols[grp_hclust])
-rgl::plot3d(coor_3d, col = mycols[grp_hclust])
-
-table(dattree[grp_hclust == 1, "Pattern"])
-table(dattree[grp_hclust == 2, "Pattern"])
-table(dattree[grp_hclust == 3, "Pattern"])
-table(dattree[grp_hclust == 4, "Pattern"])
-
-
-aggregate(. ~ grp_hclust, df, mean)
-
-
-aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
-                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
-          mean)
-
-#--------------- (2.3) DBSCAN clustering ---------------
-
-library(dbscan)
-d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
-hullplot(df, d1)
-
-grp_db <- d1$cluster
-table(grp_db)
-
-kNNdistplot(df, k = ncol(df))
-abline(h = 0.2, col = "red")
-abline(h = 0.06, col = "red")
-
-fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
-             #palette = mycols,
-             ellipse.type = "convex",
-             show.clust.cent = FALSE,
-             ggtheme = theme_bw())
-
-mycols <- c("black", mycols)
-
-plot(coor_2d, col = mycols[grp_db + 1])
-rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
-
-aggregate(. ~ grp_db, df, mean)
-
-table(dattree[grp_db == 0, "Pattern"])
-table(dattree[grp_db == 1, "Pattern"])
-table(dattree[grp_db == 2, "Pattern"])
-table(dattree[grp_db == 3, "Pattern"])
-
-# Does not really work with these features!
-
-
-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-
-# Navigation types by Bousbia et al. (2010):
-# - Overviewing: this value is close to the Canter “scanning” value. It
-#   implies that the learner is covering a large proportion of course pages.
-#   Through this phase of fast-reading, the user seeks to acquire an
-#   overall view of the course.
-# - Flitting: close to “wandering”. It reflects a browsing activity without a
-#   strategy or a particular goal. The main difference with the overviewing
-#   type is the lack of focus on the course.
-# - Studying: corresponds to a partial or complete reading of the course
-#   pages where the learner spends time on each page.
-# - Deepening: This describes a learner who spends relatively long time on a
-#   course, checking details, and seeking Web documents related to the course
-#   topics. The main difference with studying is the Web search part that the
-#   learner uses to obtain a deeper understanding of the course.
-
-# Taxonomy defined by Canter et al. (1985):
-# - Scanning: seeking an overview of a theme (i.e. subpart of the hypermedia)
-#   by requesting an important proportion of its pages but without spending
-#   much time on them.
-# - Browsing: going wherever the data leads the navigator until catching an
-#   interest.
-# - Exploring: reading the viewed pages thoroughly.
-# - Searching: seeking for a particular document or information.
-# - Wandering: navigating in an unstructured fashion without any particular
-#   goal or strategy.
+# TODO: Move to helper file

 # Features for navigation types for MTT:
 # - Scanning / Overviewing:
-#   * Proportion of artworks looked at is high: datcase$nitems / 70
+#   * Proportion of artworks looked at is high
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration
 # - Exploring:
-#   * Looking at additional information for most items touched (high value):
-#   harmonic mean of datcase$nopenTopic / datcase$nflipCard and
-#   datcase$nopenPopup / datcase$nopenTopic
+#   * Looking at additional information is high
 # - Searching / Studying:
-#   * Looking only at a few items
-#   datcase$nitems / 70 is low
+#   * Proportion of artworks looked at is low
 #   * Opening few cards
 #   datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low
 #   * but for most cards popups are opened:
 #   datcase$nopenPopup / datcase$nflipCard is high
 # - Wandering / Flitting:
-#   * Items are mostly just moved:
-#   datcase$nmove / datcase$length is high
+#   * Proportion of moves is high
 #   * Duration per case is low:
 #   datcase$duration / mean(datcase$duration) or median(datcase$duration)
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration

 dattree <- data.frame(case = datcase$case,
-                      #Duration = datcase$duration,
-                      PropItems = datcase$nitems / length(unique(dat$item)),
-                      #PropTopic = datcase$nopenTopic / datcase$nflipCard,
-                      #PropPopup = datcase$nopenPopup / datcase$nopenTopic,
-                      # SearchInfo = 
-                      #   2*(((datcase$nopenPopup / datcase$nopenTopic) *
-                      #       (datcase$nopenTopic / datcase$nflipCard)) /
-                      # ((datcase$nopenPopup / datcase$nopenTopic) +
-                      #  (datcase$nopenTopic / datcase$nflipCard))
-                      # ),
-                      SearchInfo = datcase$nopenTopic / datcase$nflipCard +
-                        datcase$nopenPopup / datcase$nopenTopic,
+                      NumItems = datcase$nitems,
+                      NumTopic = datcase$nopenTopic,
+                      NumPopup = datcase$nopenPopup,
                      PropMoves = datcase$nmove / datcase$length,
                      PathLinearity = datcase$nitems / datcase$npaths,
                      Singularity = datcase$npaths / datcase$length
 )

-#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
-dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
-
-#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
-#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
+dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
+dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)

 get_centrality <- function(case, data) {

@@ -388,17 +220,16 @@ get_centrality <- function(case, data) {
    igraph::centr_degree(inet, loops = TRUE)$centralization,
    igraph::centr_betw(inet)$centralization)
 }
+# TODO: Move to helper file

+# centrality <- lapply(dattree$case, get_centrality, data = dat)
+# centrality <- do.call(rbind, centrality)
+#
+# save(centrality, file = "results/haum/tmp_centrality.RData")
+load("results/haum/tmp_centrality.RData")

-centrality <- lapply(dattree$case, get_centrality, data = dat)
-centrality <- do.call(rbind, centrality)
-
-save(centrality, file = "results/haum/tmp_centrality.RData")
-
-#dattree$centr_degree <- centrality[, 1]
-#dattree$centr_degree_loops <- centrality[, 2]
-dattree$DegreeCentrality <- centrality[, 2]
-#dattree$BetweenCentrality <- centrality[, 3]
+#dattree$DegreeCentrality <- centrality[, 2]
+dattree$BetweenCentrality <- centrality[, 3]

 ## Add average duration per item

@@ -413,135 +244,118 @@ tmp$duration <- tmp$max_time - tmp$min_time
 tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case

 dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
+#dattree$AvDurItem <- dattree$AvDurItem / datcase$duration

 rm(tmp)

+summary(dattree)
+
 plot(dattree[, -1], pch = ".")

-par(mfrow = c(3,4))
-hist(dattree$Duration, breaks = 50, main = "")
+par(mfrow = c(2,4))
 hist(dattree$AvDurItem, breaks = 50, main = "")
-hist(dattree$PropItems, breaks = 50, main = "")
-hist(dattree$PropTopic, breaks = 50, main = "")
-hist(dattree$PropPopup, breaks = 50, main = "")
+hist(dattree$NumItems, breaks = 50, main = "")
+hist(dattree$NumTopic, breaks = 50, main = "")
+hist(dattree$NumPopup, breaks = 50, main = "")
 hist(dattree$PropMoves, breaks = 50, main = "")
 hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
-hist(dattree$centr_degree, breaks = 50, main = "")
-hist(dattree$centr_degree_loops, breaks = 50, main = "")
-hist(dattree$centr_between, breaks = 50, main = "")
+hist(dattree$BetweenCentrality, breaks = 50, main = "")
+
+# Indicator variable if table was used as info terminal only
+dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, labels = c("no", "yes"))
+
+# Add pattern to datcase; loosely based on Bousbia et al. (2009)
+dattree$Pattern <- "Dispersion"
+dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar",
+                          dattree$Pattern)
+dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
+                          dattree$BetweenCentrality > 0.5, "Star",
+                          dattree$Pattern)
+dattree$Pattern <- factor(dattree$Pattern)


-cor_mat <- cor(dattree[, -1], use = "pairwise")
-diag(cor_mat) <- NA
-heatmap(cor_mat)
+# Remove cases with extreme outliers
+# TODO: Do I want this???

+quantile(datcase$nopenTopic, 0.999)
+quantile(datcase$nopenPopup, 0.999)

-# dattree$Pattern <- "Dispersion"
-# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
-#                           dattree$Singularity > 0.8, "Scholar",
-#                           dattree$Pattern)
-# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
-#                           dattree$centr_between > 0.5, "Star",
-#                           dattree$Pattern)
+dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ]
+
+plot(dattree[, -1], pch = ".")
+
+par(mfrow = c(2,4))
+hist(dattree$AvDurItem, breaks = 50, main = "")
+hist(dattree$NumItems, breaks = 50, main = "")
+hist(dattree$NumTopic, breaks = 50, main = "")
+hist(dattree$NumPopup, breaks = 50, main = "")
+hist(dattree$PropMoves, breaks = 50, main = "")
+hist(dattree$PathLinearity, breaks = 50, main = "")
+hist(dattree$Singularity, breaks = 50, main = "")
+hist(dattree$BetweenCentrality, breaks = 50, main = "")

-write.table(dattree,
-            file = "results/haum/dattree.csv",
-            sep = ";",
-            quote = FALSE,
-            row.names = FALSE)

 #--------------- (2) Clustering ---------------

-df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
-#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
-df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
-df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
-df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
+library(cluster)

-# scale Duration and min/max SearchInfo
-#df$AvDurItem <- as.numeric(scale(df$AvDurItem))
-df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) /
-  (max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE))
-#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
-#  (max(df$SearchInfo) - min(df$SearchInfo))
-df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
-  (max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE))
-df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
-  (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
+df <- dattree[1:10000, -1] # remove case variable
+# TODO: Do I need to scale or does normalization also work?
+
+# Normalize Duration and Numbers
+# df$AvDurItem <- normalize(df$AvDurItem)
+# df$NumItems  <- normalize(df$NumItems)
+# df$NumTopic  <- normalize(df$NumTopic)
+# df$NumPopup  <- normalize(df$NumPopup)
+
+# summary(df)
+
+# Look at collinearity
+cor_mat <- cor(df)
+diag(cor_mat) <- NA
+heatmap(cor_mat)
+
+#df <- as.data.frame(scale(dattree[, -1]))


-#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-df <- dattree[, -1]
-df$AvDurItem <- normalize(df$AvDurItem)
-df$SearchInfo <- normalize(df$SearchInfo)
-df$InfocardOnly <- datcase$infocardOnly
-summary(df)
-#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+#--------------- (2.2) Hierarchical clustering ---------------

-# "Flatten" with PCA
-pc <- prcomp(df)
-coor_2d <- as.data.frame(pc$x[, c(1, 2)])
-coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
+mat <- daisy(df, metric = "gower")
+
+# "Flatten" with MDS
+coor_2d <- as.data.frame(cmdscale(mat, k = 2))
+coor_3d <- as.data.frame(cmdscale(mat, k = 3))

 plot(coor_2d)
 rgl::plot3d(coor_3d)

-#--------------- (2.1) K-Means clustering ---------------
+#mat <- dist(df)

-mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
+# https://uc-r.github.io/hc_clustering
+method <- c(average = "average", single = "single", complete = "complete",
+            ward = "ward.D2")

-k1 <- kmeans(df, 4)
+hc_method <- function(x) {
+  hclust(mat, method = x)
+}

-grp_km <- k1$cluster
-table(grp_km)
-
-fviz_cluster(list(data = df, cluster = grp_km),
-             palette = mycols,
-             ellipse.type = "convex",
-             show.clust.cent = FALSE,
-             ggtheme = theme_bw())
-
-plot(coor_2d, col = mycols[grp_km])
-
-rgl::plot3d(coor_3d, col = mycols[grp_km])
-
-aggregate(. ~ grp_km, df, mean)
-
-#--------------- (2.2) Hierarchical clustering ---------------
-
-mat <- dist(df)
-
-h1 <- hclust(mat, method = "average")
-h2 <- hclust(mat, method = "complete")
-h3 <- hclust(mat, method = "ward.D")
-h4 <- hclust(mat, method = "ward.D2")
-h5 <- hclust(mat, method = "single")
-
-# Cophenetic Distances, for each linkage (runs quite some time!)
-c1 <- cophenetic(h1)
-c2 <- cophenetic(h2)
-c3 <- cophenetic(h3)
-c4 <- cophenetic(h4)
-c5 <- cophenetic(h5)
-
-# Correlations
-cor(mat, c1)
-cor(mat, c2)
-cor(mat, c3)
-cor(mat, c4)
-cor(mat, c5)
+hcs <- lapply(method, hc_method)
+cds <- lapply(hcs, cophenetic)
+cors <- sapply(cds, cor, y = mat)

 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering

-hc <- h4
+hc <- hcs$average

 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)

 k <- 4

+mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
+
 grp_hclust <- cutree(hc, k = k)

 table(grp_hclust)
@@ -553,17 +367,16 @@ fviz_cluster(list(data = df, cluster = grp_hclust),
             ggtheme = theme_bw())

 plot(coor_2d, col = mycols[grp_hclust])
+legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21)
 rgl::plot3d(coor_3d, col = mycols[grp_hclust])

-table(dattree[grp_hclust == 1, "Pattern"])
-table(dattree[grp_hclust == 2, "Pattern"])
-table(dattree[grp_hclust == 3, "Pattern"])
-table(dattree[grp_hclust == 4, "Pattern"])
-
+table(datcase[grp_hclust == 1, "Pattern"])
+table(datcase[grp_hclust == 2, "Pattern"])
+table(datcase[grp_hclust == 3, "Pattern"])
+table(datcase[grp_hclust == 4, "Pattern"])

 aggregate(. ~ grp_hclust, df, mean)

-
 aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
          mean)
@@ -571,7 +384,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
 #--------------- (2.3) DBSCAN clustering ---------------

 library(dbscan)
-d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
+d1 <- dbscan(df, eps = 1, minPts = ncol(df) + 1)
 hullplot(df, d1)

 grp_db <- d1$cluster
@@ -579,10 +392,10 @@ table(grp_db)

 kNNdistplot(df, k = ncol(df))
 abline(h = 0.2, col = "red")
-abline(h = 0.06, col = "red")
+abline(h = 1, col = "red")

 fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
-             #palette = mycols,
+             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
@@ -595,17 +408,20 @@ rgl::plot3d(coor_3d, col = mycols[grp_db + 1])

 aggregate(. ~ grp_db, df, mean)

-table(dattree[grp_db == 0, "Pattern"])
-table(dattree[grp_db == 1, "Pattern"])
-table(dattree[grp_db == 2, "Pattern"])
-table(dattree[grp_db == 3, "Pattern"])
+table(datcase[grp_db == 0, "Pattern"])
+table(datcase[grp_db == 1, "Pattern"])
+table(datcase[grp_db == 2, "Pattern"])
+table(datcase[grp_db == 3, "Pattern"])
+table(datcase[grp_db == 4, "Pattern"])

 ### Look at selected cases ###########################################
+dattree[grp_db == 0, ]
+
 tmp <- dat
 tmp$start <- tmp$date.start
 tmp$complete <- tmp$date.stop

-alog <- activitylog(tmp[tmp$case == 30418, ],
+alog <- activitylog(tmp[tmp$case == 15, ],
                    case_id     = "case",
                    activity_id = "item",
                    resource_id = "path",
@@ -650,38 +466,23 @@ library(partykit)

 dattree_db <- dattree[grp_db != 0, -1]
 dattree_db$grp <- factor(grp_db[grp_db != 0])
-dattree_db$Pattern <- factor(dattree_db$Pattern)

 c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
 plot(as.party(c1))

-c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
+c2 <- rpart(as.factor(grp_hclust) ~ ., data = dattree[, -1], method = "class")
 plot(as.party(c2))

-
-
-c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
-              Pattern, data = dattree_db, method = "class")
-plot(as.party(c1))
-
 # with conditional tree
-c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
+c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.05)
 plot(c2)

-c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
-            PropMoves + Pattern, data = dattree_db, alpha = 0)
-plot(c3)
-
-c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
-            PropMoves + Pattern, data = dattree_db, alpha = 1)
-plot(c4)
-
 # with excluded points
-c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
+c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0)
 plot(c5)

 # with excluded points
-c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1)
+c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 0)
 plot(c6)
 # --> just checking