More work on trace clustering

This commit is contained in:
Nora Wickelmaier 2024-03-04 17:32:50 +01:00
parent 72d2b6b799
commit ea0660817a

View File

@ -126,11 +126,177 @@ datcase$duration <- datcase$max_time - datcase$min_time
datcase$min_time <- NULL datcase$min_time <- NULL
datcase$max_time <- NULL datcase$max_time <- NULL
check_infocards <- function(subdata, artworks) {
infocard_only <- NULL
if(any(unique(subdata$item) %in% artworks)) {
infocard_only <- FALSE
} else {
infocard_only <- TRUE
}
as.numeric(infocard_only)
}
artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks)
# Clean up NAs
datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance)
datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize)
datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree)
datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist)
datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details)
datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info)
datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition)
datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks)
datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe)
datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik)
datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema)
datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics)
datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards)
cor_mat <- cor(datcase[, -1], use = "pairwise") cor_mat <- cor(datcase[, -1], use = "pairwise")
diag(cor_mat) <- NA diag(cor_mat) <- NA
heatmap(cor_mat) heatmap(cor_mat)
# TODO: Add info if all items of a case are information cards??
normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
#df <- as.data.frame(lapply(datcase[, -1], normalize))
df <- as.data.frame(lapply(datcase[, -1], scale))
#df <- datcase[, -1]
# "Flatten" with PCA
pc <- prcomp(df)
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
plot(coor_2d)
rgl::plot3d(coor_3d)
#--------------- (2.1) K-Means clustering ---------------
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
k1 <- kmeans(df, 4)
grp_km <- k1$cluster
table(grp_km)
fviz_cluster(list(data = df, cluster = grp_km),
palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
plot(coor_2d, col = mycols[grp_km])
rgl::plot3d(coor_3d, col = mycols[grp_km])
aggregate(. ~ grp_km, df, mean)
#--------------- (2.2) Hierarchical clustering ---------------
mat <- dist(df)
h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete")
h3 <- hclust(mat, method = "ward.D")
h4 <- hclust(mat, method = "ward.D2")
h5 <- hclust(mat, method = "single")
# Cophenetic Distances, for each linkage (runs quite some time!)
c1 <- cophenetic(h1)
c2 <- cophenetic(h2)
c3 <- cophenetic(h3)
c4 <- cophenetic(h4)
c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
cor(mat, c2)
cor(mat, c3)
cor(mat, c4)
cor(mat, c5)
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h4
# Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
k <- 4
grp_hclust <- cutree(hc, k = k)
table(grp_hclust)
fviz_cluster(list(data = df, cluster = grp_hclust),
palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
plot(coor_2d, col = mycols[grp_hclust])
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
table(dattree[grp_hclust == 1, "Pattern"])
table(dattree[grp_hclust == 2, "Pattern"])
table(dattree[grp_hclust == 3, "Pattern"])
table(dattree[grp_hclust == 4, "Pattern"])
aggregate(. ~ grp_hclust, df, mean)
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
mean)
#--------------- (2.3) DBSCAN clustering ---------------
library(dbscan)
d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
hullplot(df, d1)
grp_db <- d1$cluster
table(grp_db)
kNNdistplot(df, k = ncol(df))
abline(h = 0.2, col = "red")
abline(h = 0.06, col = "red")
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
#palette = mycols,
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_bw())
mycols <- c("black", mycols)
plot(coor_2d, col = mycols[grp_db + 1])
rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
aggregate(. ~ grp_db, df, mean)
table(dattree[grp_db == 0, "Pattern"])
table(dattree[grp_db == 1, "Pattern"])
table(dattree[grp_db == 2, "Pattern"])
table(dattree[grp_db == 3, "Pattern"])
# Does not really work with these features!
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Navigation types by Bousbia et al. (2010): # Navigation types by Bousbia et al. (2010):
# - Overviewing: this value is close to the Canter “scanning” value. It # - Overviewing: this value is close to the Canter “scanning” value. It
@ -181,26 +347,28 @@ heatmap(cor_mat)
# * Duration per artwork is low: "ave_duration_item" / datcase$duration # * Duration per artwork is low: "ave_duration_item" / datcase$duration
dattree <- data.frame(case = datcase$case, dattree <- data.frame(case = datcase$case,
Duration = datcase$duration, #Duration = datcase$duration,
PropItems = datcase$nitems / length(unique(dat$item)), PropItems = datcase$nitems / length(unique(dat$item)),
PropTopic = datcase$nopenTopic / datcase$nflipCard, #PropTopic = datcase$nopenTopic / datcase$nflipCard,
PropPopup = datcase$nopenPopup / datcase$nopenTopic, #PropPopup = datcase$nopenPopup / datcase$nopenTopic,
# SearchInfo = # SearchInfo =
# 2*(((datcase$nopenPopup / datcase$nopenTopic) * # 2*(((datcase$nopenPopup / datcase$nopenTopic) *
# (datcase$nopenTopic / datcase$nflipCard)) / # (datcase$nopenTopic / datcase$nflipCard)) /
# ((datcase$nopenPopup / datcase$nopenTopic) + # ((datcase$nopenPopup / datcase$nopenTopic) +
# (datcase$nopenTopic / datcase$nflipCard)) # (datcase$nopenTopic / datcase$nflipCard))
# ), # ),
SearchInfo = datcase$nopenTopic / datcase$nflipCard +
datcase$nopenPopup / datcase$nopenTopic,
PropMoves = datcase$nmove / datcase$length, PropMoves = datcase$nmove / datcase$length,
PathLinearity = datcase$nitems / datcase$npaths, PathLinearity = datcase$nitems / datcase$npaths,
Singularity = datcase$npaths / datcase$length Singularity = datcase$npaths / datcase$length
) )
#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo) #dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
#dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo) dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic) #dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup) #dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
get_centrality <- function(case, data) { get_centrality <- function(case, data) {
@ -227,9 +395,10 @@ centrality <- do.call(rbind, centrality)
save(centrality, file = "results/haum/tmp_centrality.RData") save(centrality, file = "results/haum/tmp_centrality.RData")
dattree$centr_degree <- centrality[, 1] #dattree$centr_degree <- centrality[, 1]
dattree$centr_degree_loops <- centrality[, 2] #dattree$centr_degree_loops <- centrality[, 2]
dattree$centr_between <- centrality[, 3] dattree$DegreeCentrality <- centrality[, 2]
#dattree$BetweenCentrality <- centrality[, 3]
## Add average duration per item ## Add average duration per item
@ -247,7 +416,7 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
rm(tmp) rm(tmp)
plot(dattree) plot(dattree[, -1], pch = ".")
par(mfrow = c(3,4)) par(mfrow = c(3,4))
hist(dattree$Duration, breaks = 50, main = "") hist(dattree$Duration, breaks = 50, main = "")
@ -268,13 +437,13 @@ diag(cor_mat) <- NA
heatmap(cor_mat) heatmap(cor_mat)
dattree$Pattern <- "Dispersion" # dattree$Pattern <- "Dispersion"
dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & # dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
dattree$Singularity > 0.8, "Scholar", # dattree$Singularity > 0.8, "Scholar",
dattree$Pattern) # dattree$Pattern)
dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & # dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
dattree$centr_between > 0.5, "Star", # dattree$centr_between > 0.5, "Star",
dattree$Pattern) # dattree$Pattern)
write.table(dattree, write.table(dattree,
file = "results/haum/dattree.csv", file = "results/haum/dattree.csv",
@ -302,11 +471,22 @@ df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
(max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE)) (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
df <- dattree[, -1]
df$AvDurItem <- normalize(df$AvDurItem)
df$SearchInfo <- normalize(df$SearchInfo)
df$InfocardOnly <- datcase$infocardOnly
summary(df)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# "Flatten" with PCA # "Flatten" with PCA
pc <- prcomp(df) pc <- prcomp(df)
coor_2d <- as.data.frame(pc$x[, c(1, 2)]) coor_2d <- as.data.frame(pc$x[, c(1, 2)])
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)]) coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
plot(coor_2d)
rgl::plot3d(coor_3d)
#--------------- (2.1) K-Means clustering --------------- #--------------- (2.1) K-Means clustering ---------------
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
@ -326,10 +506,11 @@ plot(coor_2d, col = mycols[grp_km])
rgl::plot3d(coor_3d, col = mycols[grp_km]) rgl::plot3d(coor_3d, col = mycols[grp_km])
aggregate(. ~ grp_km, df, mean)
#--------------- (2.2) Hierarchical clustering --------------- #--------------- (2.2) Hierarchical clustering ---------------
mat <- dist(df) mat <- dist(df)
# TODO: Do I need to scale all variables?
h1 <- hclust(mat, method = "average") h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete") h2 <- hclust(mat, method = "complete")
@ -354,7 +535,7 @@ cor(mat, c5)
# https://en.wikipedia.org/wiki/Cophenetic_correlation # https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h1 hc <- h4
# Something like a scree plot (??) # Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@ -390,7 +571,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
#--------------- (2.3) DBSCAN clustering --------------- #--------------- (2.3) DBSCAN clustering ---------------
library(dbscan) library(dbscan)
d1 <- dbscan(df, eps = .2, minPts = 9) d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
hullplot(df, d1) hullplot(df, d1)
grp_db <- d1$cluster grp_db <- d1$cluster
@ -398,13 +579,18 @@ table(grp_db)
kNNdistplot(df, k = ncol(df)) kNNdistplot(df, k = ncol(df))
abline(h = 0.2, col = "red") abline(h = 0.2, col = "red")
abline(h = 0.06, col = "red")
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]), fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
palette = mycols, #palette = mycols,
ellipse.type = "convex", ellipse.type = "convex",
show.clust.cent = FALSE, show.clust.cent = FALSE,
ggtheme = theme_bw()) ggtheme = theme_bw())
mycols <- c("black", mycols)
plot(coor_2d, col = mycols[grp_db + 1])
legend("topleft", paste("Cl", 0:4), col = mycols, pch = 21)
rgl::plot3d(coor_3d, col = mycols[grp_db + 1]) rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
aggregate(. ~ grp_db, df, mean) aggregate(. ~ grp_db, df, mean)
@ -462,17 +648,24 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df,
library(rpart) library(rpart)
library(partykit) library(partykit)
dattree_db <- dattree[grp_db != 0, ] dattree_db <- dattree[grp_db != 0, -1]
dattree_db$grp <- factor(grp_db[grp_db != 0]) dattree_db$grp <- factor(grp_db[grp_db != 0])
dattree_db$Pattern <- factor(dattree_db$Pattern) dattree_db$Pattern <- factor(dattree_db$Pattern)
c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
plot(as.party(c1))
c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
plot(as.party(c2))
c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves + c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
Pattern, data = dattree_db, method = "class") Pattern, data = dattree_db, method = "class")
plot(as.party(c1)) plot(as.party(c1))
# with conditional tree # with conditional tree
c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
PropMoves + Pattern, data = dattree_db, alpha = 0.5)
plot(c2) plot(c2)
c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
@ -484,8 +677,7 @@ c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
plot(c4) plot(c4)
# with excluded points # with excluded points
c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup + c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
PropMoves + factor(Pattern), data = dattree, alpha = 1)
plot(c5) plot(c5)
# with excluded points # with excluded points
@ -603,3 +795,5 @@ igraph::centr_degree(inet, loops = FALSE)
igraph::centr_betw(inet) igraph::centr_betw(inet)
igraph::centr_clo(inet) igraph::centr_clo(inet)