More work on trace clustering
This commit is contained in:
parent
72d2b6b799
commit
ea0660817a
@ -126,11 +126,177 @@ datcase$duration <- datcase$max_time - datcase$min_time
|
||||
datcase$min_time <- NULL
|
||||
datcase$max_time <- NULL
|
||||
|
||||
|
||||
check_infocards <- function(subdata, artworks) {
|
||||
infocard_only <- NULL
|
||||
if(any(unique(subdata$item) %in% artworks)) {
|
||||
infocard_only <- FALSE
|
||||
} else {
|
||||
infocard_only <- TRUE
|
||||
}
|
||||
as.numeric(infocard_only)
|
||||
}
|
||||
|
||||
artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
|
||||
|
||||
datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks)
|
||||
|
||||
|
||||
# Clean up NAs
|
||||
datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance)
|
||||
datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize)
|
||||
datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree)
|
||||
datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist)
|
||||
datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details)
|
||||
datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info)
|
||||
datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition)
|
||||
datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks)
|
||||
datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe)
|
||||
datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik)
|
||||
datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema)
|
||||
datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics)
|
||||
datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards)
|
||||
|
||||
|
||||
|
||||
cor_mat <- cor(datcase[, -1], use = "pairwise")
|
||||
diag(cor_mat) <- NA
|
||||
heatmap(cor_mat)
|
||||
|
||||
# TODO: Add info if all items of a case are information cards??
|
||||
|
||||
normalize <- function(x) {
|
||||
(x - min(x)) / (max(x) - min(x))
|
||||
}
|
||||
|
||||
|
||||
#df <- as.data.frame(lapply(datcase[, -1], normalize))
|
||||
df <- as.data.frame(lapply(datcase[, -1], scale))
|
||||
#df <- datcase[, -1]
|
||||
|
||||
# "Flatten" with PCA
|
||||
pc <- prcomp(df)
|
||||
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
|
||||
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
|
||||
|
||||
plot(coor_2d)
|
||||
rgl::plot3d(coor_3d)
|
||||
|
||||
#--------------- (2.1) K-Means clustering ---------------
|
||||
|
||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
||||
|
||||
k1 <- kmeans(df, 4)
|
||||
|
||||
grp_km <- k1$cluster
|
||||
table(grp_km)
|
||||
|
||||
fviz_cluster(list(data = df, cluster = grp_km),
|
||||
palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
plot(coor_2d, col = mycols[grp_km])
|
||||
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_km])
|
||||
|
||||
aggregate(. ~ grp_km, df, mean)
|
||||
|
||||
#--------------- (2.2) Hierarchical clustering ---------------
|
||||
|
||||
mat <- dist(df)
|
||||
|
||||
h1 <- hclust(mat, method = "average")
|
||||
h2 <- hclust(mat, method = "complete")
|
||||
h3 <- hclust(mat, method = "ward.D")
|
||||
h4 <- hclust(mat, method = "ward.D2")
|
||||
h5 <- hclust(mat, method = "single")
|
||||
|
||||
# Cophenetic Distances, for each linkage (runs quite some time!)
|
||||
c1 <- cophenetic(h1)
|
||||
c2 <- cophenetic(h2)
|
||||
c3 <- cophenetic(h3)
|
||||
c4 <- cophenetic(h4)
|
||||
c5 <- cophenetic(h5)
|
||||
|
||||
# Correlations
|
||||
cor(mat, c1)
|
||||
cor(mat, c2)
|
||||
cor(mat, c3)
|
||||
cor(mat, c4)
|
||||
cor(mat, c5)
|
||||
|
||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||
|
||||
hc <- h4
|
||||
|
||||
# Something like a scree plot (??)
|
||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||
|
||||
k <- 4
|
||||
|
||||
grp_hclust <- cutree(hc, k = k)
|
||||
|
||||
table(grp_hclust)
|
||||
|
||||
fviz_cluster(list(data = df, cluster = grp_hclust),
|
||||
palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
plot(coor_2d, col = mycols[grp_hclust])
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
|
||||
|
||||
table(dattree[grp_hclust == 1, "Pattern"])
|
||||
table(dattree[grp_hclust == 2, "Pattern"])
|
||||
table(dattree[grp_hclust == 3, "Pattern"])
|
||||
table(dattree[grp_hclust == 4, "Pattern"])
|
||||
|
||||
|
||||
aggregate(. ~ grp_hclust, df, mean)
|
||||
|
||||
|
||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
||||
mean)
|
||||
|
||||
#--------------- (2.3) DBSCAN clustering ---------------
|
||||
|
||||
library(dbscan)
|
||||
d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
|
||||
hullplot(df, d1)
|
||||
|
||||
grp_db <- d1$cluster
|
||||
table(grp_db)
|
||||
|
||||
kNNdistplot(df, k = ncol(df))
|
||||
abline(h = 0.2, col = "red")
|
||||
abline(h = 0.06, col = "red")
|
||||
|
||||
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
||||
#palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
mycols <- c("black", mycols)
|
||||
|
||||
plot(coor_2d, col = mycols[grp_db + 1])
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
|
||||
|
||||
aggregate(. ~ grp_db, df, mean)
|
||||
|
||||
table(dattree[grp_db == 0, "Pattern"])
|
||||
table(dattree[grp_db == 1, "Pattern"])
|
||||
table(dattree[grp_db == 2, "Pattern"])
|
||||
table(dattree[grp_db == 3, "Pattern"])
|
||||
|
||||
# Does not really work with these features!
|
||||
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
# Navigation types by Bousbia et al. (2010):
|
||||
# - Overviewing: this value is close to the Canter “scanning” value. It
|
||||
@ -181,26 +347,28 @@ heatmap(cor_mat)
|
||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||
|
||||
dattree <- data.frame(case = datcase$case,
|
||||
Duration = datcase$duration,
|
||||
#Duration = datcase$duration,
|
||||
PropItems = datcase$nitems / length(unique(dat$item)),
|
||||
PropTopic = datcase$nopenTopic / datcase$nflipCard,
|
||||
PropPopup = datcase$nopenPopup / datcase$nopenTopic,
|
||||
#PropTopic = datcase$nopenTopic / datcase$nflipCard,
|
||||
#PropPopup = datcase$nopenPopup / datcase$nopenTopic,
|
||||
# SearchInfo =
|
||||
# 2*(((datcase$nopenPopup / datcase$nopenTopic) *
|
||||
# (datcase$nopenTopic / datcase$nflipCard)) /
|
||||
# ((datcase$nopenPopup / datcase$nopenTopic) +
|
||||
# (datcase$nopenTopic / datcase$nflipCard))
|
||||
# ),
|
||||
SearchInfo = datcase$nopenTopic / datcase$nflipCard +
|
||||
datcase$nopenPopup / datcase$nopenTopic,
|
||||
PropMoves = datcase$nmove / datcase$length,
|
||||
PathLinearity = datcase$nitems / datcase$npaths,
|
||||
Singularity = datcase$npaths / datcase$length
|
||||
)
|
||||
|
||||
#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
|
||||
#dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
|
||||
dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
|
||||
|
||||
dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
|
||||
dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
|
||||
#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
|
||||
#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
|
||||
|
||||
get_centrality <- function(case, data) {
|
||||
|
||||
@ -227,9 +395,10 @@ centrality <- do.call(rbind, centrality)
|
||||
|
||||
save(centrality, file = "results/haum/tmp_centrality.RData")
|
||||
|
||||
dattree$centr_degree <- centrality[, 1]
|
||||
dattree$centr_degree_loops <- centrality[, 2]
|
||||
dattree$centr_between <- centrality[, 3]
|
||||
#dattree$centr_degree <- centrality[, 1]
|
||||
#dattree$centr_degree_loops <- centrality[, 2]
|
||||
dattree$DegreeCentrality <- centrality[, 2]
|
||||
#dattree$BetweenCentrality <- centrality[, 3]
|
||||
|
||||
## Add average duration per item
|
||||
|
||||
@ -247,7 +416,7 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
|
||||
|
||||
rm(tmp)
|
||||
|
||||
plot(dattree)
|
||||
plot(dattree[, -1], pch = ".")
|
||||
|
||||
par(mfrow = c(3,4))
|
||||
hist(dattree$Duration, breaks = 50, main = "")
|
||||
@ -268,13 +437,13 @@ diag(cor_mat) <- NA
|
||||
heatmap(cor_mat)
|
||||
|
||||
|
||||
dattree$Pattern <- "Dispersion"
|
||||
dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
|
||||
dattree$Singularity > 0.8, "Scholar",
|
||||
dattree$Pattern)
|
||||
dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
||||
dattree$centr_between > 0.5, "Star",
|
||||
dattree$Pattern)
|
||||
# dattree$Pattern <- "Dispersion"
|
||||
# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
|
||||
# dattree$Singularity > 0.8, "Scholar",
|
||||
# dattree$Pattern)
|
||||
# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
||||
# dattree$centr_between > 0.5, "Star",
|
||||
# dattree$Pattern)
|
||||
|
||||
write.table(dattree,
|
||||
file = "results/haum/dattree.csv",
|
||||
@ -302,11 +471,22 @@ df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
|
||||
(max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
|
||||
|
||||
|
||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
df <- dattree[, -1]
|
||||
df$AvDurItem <- normalize(df$AvDurItem)
|
||||
df$SearchInfo <- normalize(df$SearchInfo)
|
||||
df$InfocardOnly <- datcase$infocardOnly
|
||||
summary(df)
|
||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
# "Flatten" with PCA
|
||||
pc <- prcomp(df)
|
||||
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
|
||||
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
|
||||
|
||||
plot(coor_2d)
|
||||
rgl::plot3d(coor_3d)
|
||||
|
||||
#--------------- (2.1) K-Means clustering ---------------
|
||||
|
||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
||||
@ -326,10 +506,11 @@ plot(coor_2d, col = mycols[grp_km])
|
||||
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_km])
|
||||
|
||||
aggregate(. ~ grp_km, df, mean)
|
||||
|
||||
#--------------- (2.2) Hierarchical clustering ---------------
|
||||
|
||||
mat <- dist(df)
|
||||
# TODO: Do I need to scale all variables?
|
||||
|
||||
h1 <- hclust(mat, method = "average")
|
||||
h2 <- hclust(mat, method = "complete")
|
||||
@ -354,7 +535,7 @@ cor(mat, c5)
|
||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||
|
||||
hc <- h1
|
||||
hc <- h4
|
||||
|
||||
# Something like a scree plot (??)
|
||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||
@ -390,7 +571,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||
#--------------- (2.3) DBSCAN clustering ---------------
|
||||
|
||||
library(dbscan)
|
||||
d1 <- dbscan(df, eps = .2, minPts = 9)
|
||||
d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
|
||||
hullplot(df, d1)
|
||||
|
||||
grp_db <- d1$cluster
|
||||
@ -398,13 +579,18 @@ table(grp_db)
|
||||
|
||||
kNNdistplot(df, k = ncol(df))
|
||||
abline(h = 0.2, col = "red")
|
||||
abline(h = 0.06, col = "red")
|
||||
|
||||
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
||||
palette = mycols,
|
||||
#palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
mycols <- c("black", mycols)
|
||||
|
||||
plot(coor_2d, col = mycols[grp_db + 1])
|
||||
legend("topleft", paste("Cl", 0:4), col = mycols, pch = 21)
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
|
||||
|
||||
aggregate(. ~ grp_db, df, mean)
|
||||
@ -462,17 +648,24 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df,
|
||||
library(rpart)
|
||||
library(partykit)
|
||||
|
||||
dattree_db <- dattree[grp_db != 0, ]
|
||||
dattree_db <- dattree[grp_db != 0, -1]
|
||||
dattree_db$grp <- factor(grp_db[grp_db != 0])
|
||||
dattree_db$Pattern <- factor(dattree_db$Pattern)
|
||||
|
||||
c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
|
||||
plot(as.party(c1))
|
||||
|
||||
c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
|
||||
plot(as.party(c2))
|
||||
|
||||
|
||||
|
||||
c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
|
||||
Pattern, data = dattree_db, method = "class")
|
||||
plot(as.party(c1))
|
||||
|
||||
# with conditional tree
|
||||
c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||
PropMoves + Pattern, data = dattree_db, alpha = 0.5)
|
||||
c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
|
||||
plot(c2)
|
||||
|
||||
c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||
@ -484,8 +677,7 @@ c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||
plot(c4)
|
||||
|
||||
# with excluded points
|
||||
c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||
PropMoves + factor(Pattern), data = dattree, alpha = 1)
|
||||
c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
|
||||
plot(c5)
|
||||
|
||||
# with excluded points
|
||||
@ -603,3 +795,5 @@ igraph::centr_degree(inet, loops = FALSE)
|
||||
igraph::centr_betw(inet)
|
||||
igraph::centr_clo(inet)
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user