Played around with clustering; switched to Gower distances and removed kmeans clustering

Nora Wickelmaier 2024-03-05 16:25:49 +01:00
parent ea0660817a
commit f8c1767074


@@ -74,10 +74,12 @@ topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6]
topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7]
topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8]
datcase <- datcase |>
merge(eventtab, by = "case", all = TRUE) |>
merge(topictab, by = "case", all = TRUE)
rm(eventtab, topictab)
datcase$ntopiccards <- aggregate(topic ~ case, dat,
function(x) ifelse(all(is.na(x)), NA,
length(na.omit(x))), na.action =
@@ -136,6 +138,7 @@ check_infocards <- function(subdata, artworks) {
}
as.numeric(infocard_only)
}
# TODO: Move to helper file
artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
@@ -167,214 +170,43 @@ heatmap(cor_mat)
# Min-max normalization to [0, 1]
normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
#df <- as.data.frame(lapply(datcase[, -1], normalize))
df <- as.data.frame(lapply(datcase[, -1], scale))
#df <- datcase[, -1]
# "Flatten" with PCA
pc <- prcomp(df)
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
plot(coor_2d)
rgl::plot3d(coor_3d)
#--------------- (2.1) K-Means clustering ---------------
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
k1 <- kmeans(df, 4)
grp_km <- k1$cluster
table(grp_km)
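# fviz_cluster() and theme_bw() used below assume that factoextra and ggplot2
# are loaded earlier in the script.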
fviz_cluster(list(data = df, cluster = grp_km),
palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
plot(coor_2d, col = mycols[grp_km])
rgl::plot3d(coor_3d, col = mycols[grp_km])
aggregate(. ~ grp_km, df, mean)
#--------------- (2.2) Hierarchical clustering ---------------
mat <- dist(df)
h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete")
h3 <- hclust(mat, method = "ward.D")
h4 <- hclust(mat, method = "ward.D2")
h5 <- hclust(mat, method = "single")
# Cophenetic distances for each linkage (this takes quite a while to run!)
c1 <- cophenetic(h1)
c2 <- cophenetic(h2)
c3 <- cophenetic(h3)
c4 <- cophenetic(h4)
c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
cor(mat, c2)
cor(mat, c3)
cor(mat, c4)
cor(mat, c5)
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h4
# Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
k <- 4
grp_hclust <- cutree(hc, k = k)
table(grp_hclust)
fviz_cluster(list(data = df, cluster = grp_hclust),
palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
plot(coor_2d, col = mycols[grp_hclust])
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
table(dattree[grp_hclust == 1, "Pattern"])
table(dattree[grp_hclust == 2, "Pattern"])
table(dattree[grp_hclust == 3, "Pattern"])
table(dattree[grp_hclust == 4, "Pattern"])
aggregate(. ~ grp_hclust, df, mean)
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
mean)
#--------------- (2.3) DBSCAN clustering ---------------
library(dbscan)
d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
hullplot(df, d1)
grp_db <- d1$cluster
table(grp_db)
kNNdistplot(df, k = ncol(df))
abline(h = 0.2, col = "red")
abline(h = 0.06, col = "red")
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
#palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
mycols <- c("black", mycols)
plot(coor_2d, col = mycols[grp_db + 1])
rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
aggregate(. ~ grp_db, df, mean)
table(dattree[grp_db == 0, "Pattern"])
table(dattree[grp_db == 1, "Pattern"])
table(dattree[grp_db == 2, "Pattern"])
table(dattree[grp_db == 3, "Pattern"])
# Does not really work with these features!
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Navigation types by Bousbia et al. (2010):
# - Overviewing: this value is close to the Canter “scanning” value. It
# implies that the learner is covering a large proportion of course pages.
# Through this phase of fast-reading, the user seeks to acquire an
# overall view of the course.
# - Flitting: close to “wandering”. It reflects a browsing activity without a
# strategy or a particular goal. The main difference with the overviewing
# type is the lack of focus on the course.
# - Studying: corresponds to a partial or complete reading of the course
# pages where the learner spends time on each page.
# - Deepening: This describes a learner who spends a relatively long time on a
# course, checking details, and seeking Web documents related to the course
# topics. The main difference with studying is the Web search part that the
# learner uses to obtain a deeper understanding of the course.
# Taxonomy defined by Canter et al. (1985):
# - Scanning: seeking an overview of a theme (i.e. subpart of the hypermedia)
# by requesting a large proportion of its pages but without spending
# much time on them.
# - Browsing: going wherever the data leads the navigator until catching an
# interest.
# - Exploring: reading the viewed pages thoroughly.
# - Searching: seeking for a particular document or information.
# - Wandering: navigating in an unstructured fashion without any particular
# goal or strategy.
# TODO: Move to helper file
# Features for navigation types for MTT:
# - Scanning / Overviewing:
# * Proportion of artworks looked at is high: datcase$nitems / 70
# * Proportion of artworks looked at is high
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
# - Exploring:
# * Looking at additional information for most items touched (high value):
# harmonic mean of datcase$nopenTopic / datcase$nflipCard and
# datcase$nopenPopup / datcase$nopenTopic (see the sketch below)
# * Looking at additional information is high
# - Searching / Studying:
# * Looking only at a few items
# datcase$nitems / 70 is low
# * Proportion of artworks looked at is low
# * Opening few cards
# datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low
# * but for most cards popups are opened:
# datcase$nopenPopup / datcase$nflipCard is high
# - Wandering / Flitting:
# * Items are mostly just moved:
# datcase$nmove / datcase$length is high
# * Proportion of moves is high
# * Duration per case is low:
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
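# Sketch (not part of the analysis below): the harmonic-mean variant of
# SearchInfo described above, using a hypothetical helper search_info_hm()
# that guards against the zero denominators (no flipped cards, no opened
# topics) which make the ratio-based indices NaN or Inf. The code below
# currently uses the simpler sum of the two proportions instead.
search_info_hm <- function(nflipCard, nopenTopic, nopenPopup) {
  p_topic <- nopenTopic / nflipCard   # topics opened per flipped card
  p_popup <- nopenPopup / nopenTopic  # popups opened per opened topic
  out <- 2 * (p_topic * p_popup) / (p_topic + p_popup)
  ifelse(is.finite(out), out, 0)
}
# Example: search_info_hm(datcase$nflipCard, datcase$nopenTopic, datcase$nopenPopup)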
dattree <- data.frame(case = datcase$case,
#Duration = datcase$duration,
PropItems = datcase$nitems / length(unique(dat$item)),
#PropTopic = datcase$nopenTopic / datcase$nflipCard,
#PropPopup = datcase$nopenPopup / datcase$nopenTopic,
# SearchInfo =
# 2*(((datcase$nopenPopup / datcase$nopenTopic) *
# (datcase$nopenTopic / datcase$nflipCard)) /
# ((datcase$nopenPopup / datcase$nopenTopic) +
# (datcase$nopenTopic / datcase$nflipCard))
# ),
SearchInfo = datcase$nopenTopic / datcase$nflipCard +
datcase$nopenPopup / datcase$nopenTopic,
NumItems = datcase$nitems,
NumTopic = datcase$nopenTopic,
NumPopup = datcase$nopenPopup,
PropMoves = datcase$nmove / datcase$length,
PathLinearity = datcase$nitems / datcase$npaths,
Singularity = datcase$npaths / datcase$length
)
#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
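# get_centrality(): builds a per-case activity log with bupaR's activitylog()
# and returns the degree and betweenness centralization of the derived item
# network (inet) via igraph. Assumes the bupaR and igraph packages are
# available.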
get_centrality <- function(case, data) {
data$start <- data$date.start
data$complete <- data$date.stop
alog <- activitylog(data[data$case == case, ],
case_id = "case",
activity_id = "item",
@@ -388,17 +220,16 @@ get_centrality <- function(case, data) {
igraph::centr_degree(inet, loops = TRUE)$centralization,
igraph::centr_betw(inet)$centralization)
}
# TODO: Move to helper file
# centrality <- lapply(dattree$case, get_centrality, data = dat)
# centrality <- do.call(rbind, centrality)
#
# save(centrality, file = "results/haum/tmp_centrality.RData")
load("results/haum/tmp_centrality.RData")
centrality <- lapply(dattree$case, get_centrality, data = dat)
centrality <- do.call(rbind, centrality)
save(centrality, file = "results/haum/tmp_centrality.RData")
#dattree$centr_degree <- centrality[, 1]
#dattree$centr_degree_loops <- centrality[, 2]
dattree$DegreeCentrality <- centrality[, 2]
#dattree$BetweenCentrality <- centrality[, 3]
#dattree$DegreeCentrality <- centrality[, 2]
dattree$BetweenCentrality <- centrality[, 3]
## Add average duration per item
@@ -413,135 +244,118 @@ tmp$duration <- tmp$max_time - tmp$min_time
tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case
dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
#dattree$AvDurItem <- dattree$AvDurItem / datcase$duration
rm(tmp)
summary(dattree)
plot(dattree[, -1], pch = ".")
par(mfrow = c(3,4))
hist(dattree$Duration, breaks = 50, main = "")
par(mfrow = c(2,4))
hist(dattree$AvDurItem, breaks = 50, main = "")
hist(dattree$PropItems, breaks = 50, main = "")
hist(dattree$PropTopic, breaks = 50, main = "")
hist(dattree$PropPopup, breaks = 50, main = "")
hist(dattree$NumItems, breaks = 50, main = "")
hist(dattree$NumTopic, breaks = 50, main = "")
hist(dattree$NumPopup, breaks = 50, main = "")
hist(dattree$PropMoves, breaks = 50, main = "")
hist(dattree$PathLinearity, breaks = 50, main = "")
hist(dattree$Singularity, breaks = 50, main = "")
hist(dattree$centr_degree, breaks = 50, main = "")
hist(dattree$centr_degree_loops, breaks = 50, main = "")
hist(dattree$centr_between, breaks = 50, main = "")
hist(dattree$BetweenCentrality, breaks = 50, main = "")
# Indicator variable for whether the table was used as an info terminal only
dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, labels = c("no", "yes"))
# Add pattern to dattree; loosely based on Bousbia et al. (2009)
dattree$Pattern <- "Dispersion"
dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar",
dattree$Pattern)
dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
dattree$BetweenCentrality > 0.5, "Star",
dattree$Pattern)
dattree$Pattern <- factor(dattree$Pattern)
cor_mat <- cor(dattree[, -1], use = "pairwise")
diag(cor_mat) <- NA
heatmap(cor_mat)
# Remove cases with extreme outliers
# TODO: Do I want this???
quantile(datcase$nopenTopic, 0.999)
quantile(datcase$nopenPopup, 0.999)
# dattree$Pattern <- "Dispersion"
# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
# dattree$Singularity > 0.8, "Scholar",
# dattree$Pattern)
# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
# dattree$centr_between > 0.5, "Star",
# dattree$Pattern)
dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ]
plot(dattree[, -1], pch = ".")
par(mfrow = c(2,4))
hist(dattree$AvDurItem, breaks = 50, main = "")
hist(dattree$NumItems, breaks = 50, main = "")
hist(dattree$NumTopic, breaks = 50, main = "")
hist(dattree$NumPopup, breaks = 50, main = "")
hist(dattree$PropMoves, breaks = 50, main = "")
hist(dattree$PathLinearity, breaks = 50, main = "")
hist(dattree$Singularity, breaks = 50, main = "")
hist(dattree$BetweenCentrality, breaks = 50, main = "")
write.table(dattree,
file = "results/haum/dattree.csv",
sep = ";",
quote = FALSE,
row.names = FALSE)
#--------------- (2) Clustering ---------------
df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
library(cluster)
# scale Duration and min/max SearchInfo
#df$AvDurItem <- as.numeric(scale(df$AvDurItem))
df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) /
(max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE))
#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
# (max(df$SearchInfo) - min(df$SearchInfo))
df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
(max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE))
df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
(max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
df <- dattree[1:10000, -1] # only the first 10,000 cases; drop the case variable
# TODO: Do I need to scale or does normalization also work?
# Normalize Duration and Numbers
# df$AvDurItem <- normalize(df$AvDurItem)
# df$NumItems <- normalize(df$NumItems)
# df$NumTopic <- normalize(df$NumTopic)
# df$NumPopup <- normalize(df$NumPopup)
# summary(df)
# Look at collinearity
cor_mat <- cor(df)
diag(cor_mat) <- NA
heatmap(cor_mat)
#df <- as.data.frame(scale(dattree[, -1]))
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
df <- dattree[, -1]
df$AvDurItem <- normalize(df$AvDurItem)
df$SearchInfo <- normalize(df$SearchInfo)
df$InfocardOnly <- datcase$infocardOnly
summary(df)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#--------------- (2.2) Hierarchical clustering ---------------
# "Flatten" with PCA
pc <- prcomp(df)
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
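# Gower dissimilarities via cluster::daisy(): unlike dist(), daisy() handles a
# data frame that mixes numeric and factor columns (e.g. Pattern) and
# range-scales the numeric variables; cmdscale() below then embeds the
# dissimilarities in two or three dimensions (classical MDS) for plotting.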
mat <- daisy(df, metric = "gower")
# "Flatten" with MDS
coor_2d <- as.data.frame(cmdscale(mat, k = 2))
coor_3d <- as.data.frame(cmdscale(mat, k = 3))
plot(coor_2d)
rgl::plot3d(coor_3d)
#--------------- (2.1) K-Means clustering ---------------
#mat <- dist(df)
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
# https://uc-r.github.io/hc_clustering
method <- c(average = "average", single = "single", complete = "complete",
ward = "ward.D2")
k1 <- kmeans(df, 4)
hc_method <- function(x) {
hclust(mat, method = x)
}
grp_km <- k1$cluster
table(grp_km)
fviz_cluster(list(data = df, cluster = grp_km),
palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
plot(coor_2d, col = mycols[grp_km])
rgl::plot3d(coor_3d, col = mycols[grp_km])
aggregate(. ~ grp_km, df, mean)
#--------------- (2.2) Hierarchical clustering ---------------
mat <- dist(df)
h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete")
h3 <- hclust(mat, method = "ward.D")
h4 <- hclust(mat, method = "ward.D2")
h5 <- hclust(mat, method = "single")
# Cophenetic distances for each linkage (this takes quite a while to run!)
c1 <- cophenetic(h1)
c2 <- cophenetic(h2)
c3 <- cophenetic(h3)
c4 <- cophenetic(h4)
c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
cor(mat, c2)
cor(mat, c3)
cor(mat, c4)
cor(mat, c5)
hcs <- lapply(method, hc_method)
cds <- lapply(hcs, cophenetic)
cors <- sapply(cds, cor, y = mat)
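# cors: the linkage whose cophenetic distances correlate most strongly with
# the original Gower dissimilarities distorts them the least.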
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h4
hc <- hcs$average
# Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
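# An elbow in the merge heights suggests a plausible number of clusters.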
k <- 4
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
grp_hclust <- cutree(hc, k = k)
table(grp_hclust)
@@ -553,17 +367,16 @@ fviz_cluster(list(data = df, cluster = grp_hclust),
ggtheme = theme_bw())
plot(coor_2d, col = mycols[grp_hclust])
legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21)
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
table(dattree[grp_hclust == 1, "Pattern"])
table(dattree[grp_hclust == 2, "Pattern"])
table(dattree[grp_hclust == 3, "Pattern"])
table(dattree[grp_hclust == 4, "Pattern"])
table(datcase[grp_hclust == 1, "Pattern"])
table(datcase[grp_hclust == 2, "Pattern"])
table(datcase[grp_hclust == 3, "Pattern"])
table(datcase[grp_hclust == 4, "Pattern"])
aggregate(. ~ grp_hclust, df, mean)
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
mean)
@@ -571,7 +384,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
#--------------- (2.3) DBSCAN clustering ---------------
library(dbscan)
d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
d1 <- dbscan(df, eps = 1, minPts = ncol(df) + 1)
hullplot(df, d1)
grp_db <- d1$cluster
@@ -579,10 +392,10 @@ table(grp_db)
kNNdistplot(df, k = ncol(df))
abline(h = 0.2, col = "red")
abline(h = 0.06, col = "red")
abline(h = 1, col = "red")
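# The horizontal lines mark candidate eps values, read off at the "knee" of
# the sorted k-NN distance curve.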
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
#palette = mycols,
palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
@@ -595,17 +408,20 @@ rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
aggregate(. ~ grp_db, df, mean)
table(dattree[grp_db == 0, "Pattern"])
table(dattree[grp_db == 1, "Pattern"])
table(dattree[grp_db == 2, "Pattern"])
table(dattree[grp_db == 3, "Pattern"])
table(datcase[grp_db == 0, "Pattern"])
table(datcase[grp_db == 1, "Pattern"])
table(datcase[grp_db == 2, "Pattern"])
table(datcase[grp_db == 3, "Pattern"])
table(datcase[grp_db == 4, "Pattern"])
### Look at selected cases ###########################################
dattree[grp_db == 0, ]
tmp <- dat
tmp$start <- tmp$date.start
tmp$complete <- tmp$date.stop
alog <- activitylog(tmp[tmp$case == 30418, ],
alog <- activitylog(tmp[tmp$case == 15, ],
case_id = "case",
activity_id = "item",
resource_id = "path",
@@ -650,38 +466,23 @@ library(partykit)
dattree_db <- dattree[grp_db != 0, -1]
dattree_db$grp <- factor(grp_db[grp_db != 0])
dattree_db$Pattern <- factor(dattree_db$Pattern)
c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
plot(as.party(c1))
c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
c2 <- rpart(as.factor(grp_hclust) ~ ., data = dattree[, -1], method = "class")
plot(as.party(c2))
c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
Pattern, data = dattree_db, method = "class")
plot(as.party(c1))
# with conditional tree
c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.05)
plot(c2)
c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
PropMoves + Pattern, data = dattree_db, alpha = 0)
plot(c3)
c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
PropMoves + Pattern, data = dattree_db, alpha = 1)
plot(c4)
# with excluded points
c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0)
plot(c5)
# with excluded points
c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1)
c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 0)
plot(c6)
# --> just checking