Looked at item clustering again
This commit is contained in:
parent
f9f8086486
commit
d78a224851
@ -34,6 +34,8 @@ dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
|
||||
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
|
||||
"openPopup"))
|
||||
|
||||
# TODO: Maybe look at this with complete data?
|
||||
|
||||
# Select data pre Corona
|
||||
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
|
||||
dat <- dat[dat$path != 106098, ]
|
||||
@ -63,29 +65,77 @@ df <- datitem[, c("precision", "generalizability", "nvariants",
|
||||
scale()
|
||||
mat <- dist(df)
|
||||
|
||||
hc <- hclust(mat, method = "ward.D2")
|
||||
heatmap(as.matrix(mat))
|
||||
|
||||
grp <- cutree(hc, k = 6)
|
||||
# Choosing best linkage method
|
||||
h1 <- hclust(mat, method = "average")
|
||||
h2 <- hclust(mat, method = "complete")
|
||||
h3 <- hclust(mat, method = "ward.D")
|
||||
h4 <- hclust(mat, method = "ward.D2")
|
||||
h5 <- hclust(mat, method = "single")
|
||||
|
||||
# Cophenetic Distances, for each linkage
|
||||
c1 <- cophenetic(h1)
|
||||
c2 <- cophenetic(h2)
|
||||
c3 <- cophenetic(h3)
|
||||
c4 <- cophenetic(h4)
|
||||
c5 <- cophenetic(h5)
|
||||
|
||||
# Correlations
|
||||
cor(mat, c1)
|
||||
cor(mat, c2)
|
||||
cor(mat, c3)
|
||||
cor(mat, c4)
|
||||
cor(mat, c5)
|
||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||
|
||||
# Dendograms
|
||||
par(mfrow=c(3,2))
|
||||
plot(h1, main = "Average Linkage")
|
||||
plot(h2, main = "Complete Linkage")
|
||||
plot(h3, main = "Ward Linkage")
|
||||
plot(h4, main = "Ward 2 Linkage")
|
||||
plot(h5, main = "Single Linkage")
|
||||
|
||||
|
||||
hc <- h1
|
||||
# Note that ‘agnes(*, method="ward")’ corresponds to ‘hclust(*, "ward.D2")’
|
||||
|
||||
k <- 7 # number of clusters
|
||||
|
||||
grp <- cutree(hc, k = k)
|
||||
datitem$grp <- grp
|
||||
|
||||
fviz_dend(hc, k = 6,
|
||||
fviz_dend(hc, k = k,
|
||||
cex = 0.5,
|
||||
k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
|
||||
"#FF6900", "#434F4F"),
|
||||
"#FF6900", "gold", "#434F4F"),
|
||||
#type = "phylogenic",
|
||||
rect = TRUE
|
||||
)
|
||||
|
||||
plot(hc)
|
||||
rect.hclust(hc, k=8, border="red")
|
||||
rect.hclust(hc, k=7, border="blue")
|
||||
rect.hclust(hc, k=6, border="green")
|
||||
|
||||
p <- fviz_cluster(list(data = df, cluster = grp),
|
||||
palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
|
||||
"#FF6900", "#434F4F"),
|
||||
"#FF6900", "#434F4F", "gold"),
|
||||
ellipse.type = "convex",
|
||||
repel = TRUE,
|
||||
show.clust.cent = FALSE, ggtheme = theme_bw())
|
||||
p
|
||||
|
||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
||||
ncases, ntopics) ~ grp, datitem, mean)
|
||||
ncases, ntopics) ~ grp, datitem, median)
|
||||
|
||||
# Something like a scree plot (??)
|
||||
plot(rev(seq_along(hc$height)), hc$height, type = "l")
|
||||
points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5)
|
||||
|
||||
|
||||
|
||||
|
||||
datitem$item <- sprintf("%03d",
|
||||
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
|
||||
@ -94,6 +144,7 @@ res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
|
||||
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
||||
|
||||
# Look at clusters
|
||||
par(mfrow = c(2,2))
|
||||
vioplot::vioplot(duration ~ grp, res)
|
||||
vioplot::vioplot(distance ~ grp, res)
|
||||
vioplot::vioplot(scaleSize ~ grp, res)
|
||||
|
Loading…
Reference in New Issue
Block a user