Looked at item clustering again
This commit is contained in:
parent
f9f8086486
commit
d78a224851
@ -34,6 +34,8 @@ dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
|
|||||||
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
|
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
|
||||||
"openPopup"))
|
"openPopup"))
|
||||||
|
|
||||||
|
# TODO: Maybe look at this with complete data?
|
||||||
|
|
||||||
# Select data pre Corona
|
# Select data pre Corona
|
||||||
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
|
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
|
||||||
dat <- dat[dat$path != 106098, ]
|
dat <- dat[dat$path != 106098, ]
|
||||||
@ -63,29 +65,77 @@ df <- datitem[, c("precision", "generalizability", "nvariants",
|
|||||||
scale()
|
scale()
|
||||||
mat <- dist(df)
|
mat <- dist(df)
|
||||||
|
|
||||||
hc <- hclust(mat, method = "ward.D2")
|
heatmap(as.matrix(mat))
|
||||||
|
|
||||||
grp <- cutree(hc, k = 6)
|
# Choosing best linkage method
|
||||||
|
h1 <- hclust(mat, method = "average")
|
||||||
|
h2 <- hclust(mat, method = "complete")
|
||||||
|
h3 <- hclust(mat, method = "ward.D")
|
||||||
|
h4 <- hclust(mat, method = "ward.D2")
|
||||||
|
h5 <- hclust(mat, method = "single")
|
||||||
|
|
||||||
|
# Cophenetic Distances, for each linkage
|
||||||
|
c1 <- cophenetic(h1)
|
||||||
|
c2 <- cophenetic(h2)
|
||||||
|
c3 <- cophenetic(h3)
|
||||||
|
c4 <- cophenetic(h4)
|
||||||
|
c5 <- cophenetic(h5)
|
||||||
|
|
||||||
|
# Correlations
|
||||||
|
cor(mat, c1)
|
||||||
|
cor(mat, c2)
|
||||||
|
cor(mat, c3)
|
||||||
|
cor(mat, c4)
|
||||||
|
cor(mat, c5)
|
||||||
|
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||||
|
|
||||||
|
# Dendograms
|
||||||
|
par(mfrow=c(3,2))
|
||||||
|
plot(h1, main = "Average Linkage")
|
||||||
|
plot(h2, main = "Complete Linkage")
|
||||||
|
plot(h3, main = "Ward Linkage")
|
||||||
|
plot(h4, main = "Ward 2 Linkage")
|
||||||
|
plot(h5, main = "Single Linkage")
|
||||||
|
|
||||||
|
|
||||||
|
hc <- h1
|
||||||
|
# Note that ‘agnes(*, method="ward")’ corresponds to ‘hclust(*, "ward.D2")’
|
||||||
|
|
||||||
|
k <- 7 # number of clusters
|
||||||
|
|
||||||
|
grp <- cutree(hc, k = k)
|
||||||
datitem$grp <- grp
|
datitem$grp <- grp
|
||||||
|
|
||||||
fviz_dend(hc, k = 6,
|
fviz_dend(hc, k = k,
|
||||||
cex = 0.5,
|
cex = 0.5,
|
||||||
k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
|
k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
|
||||||
"#FF6900", "#434F4F"),
|
"#FF6900", "gold", "#434F4F"),
|
||||||
#type = "phylogenic",
|
#type = "phylogenic",
|
||||||
rect = TRUE
|
rect = TRUE
|
||||||
)
|
)
|
||||||
|
|
||||||
|
plot(hc)
|
||||||
|
rect.hclust(hc, k=8, border="red")
|
||||||
|
rect.hclust(hc, k=7, border="blue")
|
||||||
|
rect.hclust(hc, k=6, border="green")
|
||||||
|
|
||||||
p <- fviz_cluster(list(data = df, cluster = grp),
|
p <- fviz_cluster(list(data = df, cluster = grp),
|
||||||
palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
|
palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
|
||||||
"#FF6900", "#434F4F"),
|
"#FF6900", "#434F4F", "gold"),
|
||||||
ellipse.type = "convex",
|
ellipse.type = "convex",
|
||||||
repel = TRUE,
|
repel = TRUE,
|
||||||
show.clust.cent = FALSE, ggtheme = theme_bw())
|
show.clust.cent = FALSE, ggtheme = theme_bw())
|
||||||
p
|
p
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
||||||
ncases, ntopics) ~ grp, datitem, mean)
|
ncases, ntopics) ~ grp, datitem, median)
|
||||||
|
|
||||||
|
# Something like a scree plot (??)
|
||||||
|
plot(rev(seq_along(hc$height)), hc$height, type = "l")
|
||||||
|
points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
datitem$item <- sprintf("%03d",
|
datitem$item <- sprintf("%03d",
|
||||||
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
|
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
|
||||||
@ -94,6 +144,7 @@ res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
|
|||||||
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
||||||
|
|
||||||
# Look at clusters
|
# Look at clusters
|
||||||
|
par(mfrow = c(2,2))
|
||||||
vioplot::vioplot(duration ~ grp, res)
|
vioplot::vioplot(duration ~ grp, res)
|
||||||
vioplot::vioplot(distance ~ grp, res)
|
vioplot::vioplot(distance ~ grp, res)
|
||||||
vioplot::vioplot(scaleSize ~ grp, res)
|
vioplot::vioplot(scaleSize ~ grp, res)
|
||||||
|
Loading…
Reference in New Issue
Block a user