Looked at item clustering again

This commit is contained in:
Nora Wickelmaier 2024-02-07 18:00:25 +01:00
parent f9f8086486
commit d78a224851

View File

@ -34,6 +34,8 @@ dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
"openPopup"))
# TODO: Maybe look at this with complete data?
# Select data pre Corona
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
dat <- dat[dat$path != 106098, ]
@ -63,29 +65,77 @@ df <- datitem[, c("precision", "generalizability", "nvariants",
scale()
mat <- dist(df)
hc <- hclust(mat, method = "ward.D2")
heatmap(as.matrix(mat))
grp <- cutree(hc, k = 6)
# Choosing best linkage method
h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete")
h3 <- hclust(mat, method = "ward.D")
h4 <- hclust(mat, method = "ward.D2")
h5 <- hclust(mat, method = "single")
# Cophenetic Distances, for each linkage
c1 <- cophenetic(h1)
c2 <- cophenetic(h2)
c3 <- cophenetic(h3)
c4 <- cophenetic(h4)
c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
cor(mat, c2)
cor(mat, c3)
cor(mat, c4)
cor(mat, c5)
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# Dendograms
par(mfrow=c(3,2))
plot(h1, main = "Average Linkage")
plot(h2, main = "Complete Linkage")
plot(h3, main = "Ward Linkage")
plot(h4, main = "Ward 2 Linkage")
plot(h5, main = "Single Linkage")
hc <- h1
# Note that agnes(*, method="ward") corresponds to hclust(*, "ward.D2")
k <- 7 # number of clusters
grp <- cutree(hc, k = k)
datitem$grp <- grp
fviz_dend(hc, k = 6,
fviz_dend(hc, k = k,
cex = 0.5,
k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
"#FF6900", "#434F4F"),
"#FF6900", "gold", "#434F4F"),
#type = "phylogenic",
rect = TRUE
)
plot(hc)
rect.hclust(hc, k=8, border="red")
rect.hclust(hc, k=7, border="blue")
rect.hclust(hc, k=6, border="green")
p <- fviz_cluster(list(data = df, cluster = grp),
palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
"#FF6900", "#434F4F"),
"#FF6900", "#434F4F", "gold"),
ellipse.type = "convex",
repel = TRUE,
show.clust.cent = FALSE, ggtheme = theme_bw())
p
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
ncases, ntopics) ~ grp, datitem, mean)
ncases, ntopics) ~ grp, datitem, median)
# Something like a scree plot (??)
plot(rev(seq_along(hc$height)), hc$height, type = "l")
points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5)
datitem$item <- sprintf("%03d",
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
@ -94,6 +144,7 @@ res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
# Look at clusters
par(mfrow = c(2,2))
vioplot::vioplot(duration ~ grp, res)
vioplot::vioplot(distance ~ grp, res)
vioplot::vioplot(scaleSize ~ grp, res)