diff --git a/code/08_item-clustering.R b/code/08_item-clustering.R index d901a61..aab14be 100644 --- a/code/08_item-clustering.R +++ b/code/08_item-clustering.R @@ -34,6 +34,8 @@ dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic", "openPopup")) +# TODO: Maybe look at this with complete data? + # Select data pre Corona dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ] dat <- dat[dat$path != 106098, ] @@ -63,29 +65,77 @@ df <- datitem[, c("precision", "generalizability", "nvariants", scale() mat <- dist(df) -hc <- hclust(mat, method = "ward.D2") +heatmap(as.matrix(mat)) -grp <- cutree(hc, k = 6) +# Choosing best linkage method +h1 <- hclust(mat, method = "average") +h2 <- hclust(mat, method = "complete") +h3 <- hclust(mat, method = "ward.D") +h4 <- hclust(mat, method = "ward.D2") +h5 <- hclust(mat, method = "single") + +# Cophenetic Distances, for each linkage +c1 <- cophenetic(h1) +c2 <- cophenetic(h2) +c3 <- cophenetic(h3) +c4 <- cophenetic(h4) +c5 <- cophenetic(h5) + +# Correlations +cor(mat, c1) +cor(mat, c2) +cor(mat, c3) +cor(mat, c4) +cor(mat, c5) +# https://en.wikipedia.org/wiki/Cophenetic_correlation + +# Dendograms +par(mfrow=c(3,2)) +plot(h1, main = "Average Linkage") +plot(h2, main = "Complete Linkage") +plot(h3, main = "Ward Linkage") +plot(h4, main = "Ward 2 Linkage") +plot(h5, main = "Single Linkage") + + +hc <- h1 +# Note that ‘agnes(*, method="ward")’ corresponds to ‘hclust(*, "ward.D2")’ + +k <- 7 # number of clusters + +grp <- cutree(hc, k = k) datitem$grp <- grp -fviz_dend(hc, k = 6, +fviz_dend(hc, k = k, cex = 0.5, k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E", - "#FF6900", "#434F4F"), + "#FF6900", "gold", "#434F4F"), #type = "phylogenic", rect = TRUE ) +plot(hc) +rect.hclust(hc, k=8, border="red") +rect.hclust(hc, k=7, border="blue") +rect.hclust(hc, k=6, border="green") + p <- fviz_cluster(list(data = df, cluster = grp), palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E", - "#FF6900", "#434F4F"), + "#FF6900", "#434F4F", "gold"), ellipse.type = "convex", repel = TRUE, show.clust.cent = FALSE, ggtheme = theme_bw()) p aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, - ncases, ntopics) ~ grp, datitem, mean) + ncases, ntopics) ~ grp, datitem, median) + +# Something like a scree plot (??) +plot(rev(seq_along(hc$height)), hc$height, type = "l") +points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5) + + + datitem$item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem)))) @@ -94,6 +144,7 @@ res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] # Look at clusters +par(mfrow = c(2,2)) vioplot::vioplot(duration ~ grp, res) vioplot::vioplot(distance ~ grp, res) vioplot::vioplot(scaleSize ~ grp, res)