End of day commit

This commit is contained in:
Nora Wickelmaier 2024-03-08 09:22:10 +01:00
parent 4eca6c81d6
commit 3cf6c4c51d

View File

@@ -166,16 +166,14 @@ normalize <- function(x) {
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
# Build the per-case feature frame used for clustering / tree fitting.
# NOTE(review): this diff hunk interleaves old and new lines without +/-
# markers; the raw counts (NumItems/NumTopic/NumPopup) and the derived
# proportions (PropItems/SearchInfo) presumably belong to different sides
# of the diff -- confirm against the full file.
dattree <- data.frame(case = datcase$case,
NumItems = datcase$nitems,
NumTopic = datcase$nopenTopic,
NumPopup = datcase$nopenPopup,
PropItems = datcase$nitems / length(unique(dat$item)),
SearchInfo = datcase$nopenTopic + datcase$nopenPopup,
PropMoves = datcase$nmove / datcase$length,
PathLinearity = datcase$nitems / datcase$npaths,
Singularity = datcase$npaths / datcase$length
)
# Replace NA counts (cases that never opened a topic/popup) with 0.
dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
# BUG(review): copy-paste error -- this line tests and assigns NumTopic,
# so NA values in SearchInfo are never replaced. It should read:
#   dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
get_centrality <- function(case, data) {
@@ -232,7 +230,12 @@ dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
dattree$BetweenCentrality > 0.5, "Star",
dattree$Pattern)
# Convert the derived pattern labels into a factor feature.
dattree$Pattern <- factor(dattree$Pattern)
# TODO: Get rid of PathLinearity and Singularity as features when I am
# using Pattern?
# Drop the raw features that Pattern was derived from (see TODO above).
dattree$PathLinearity <- NULL
dattree$Singularity <- NULL
dattree$BetweenCentrality <- NULL
summary(dattree)
@@ -248,6 +251,7 @@ hist(dattree$PathLinearity, breaks = 50, main = "")
# Visual distribution checks of the tree features.
hist(dattree$Singularity, breaks = 50, main = "")
# NOTE(review): Singularity and BetweenCentrality are set to NULL earlier
# in this diff (see the feature-removal lines); if these hist() calls run
# after that removal they will fail -- presumably one side of the diff
# drops them. TODO confirm against the full file.
hist(dattree$BetweenCentrality, breaks = 50, main = "")
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Remove cases with extreme outliers
# TODO: Do I want this???
@@ -267,58 +271,31 @@ hist(dattree$PropMoves, breaks = 50, main = "")
# Re-check feature distributions after the outlier-removal step above.
hist(dattree$PathLinearity, breaks = 50, main = "")
hist(dattree$Singularity, breaks = 50, main = "")
hist(dattree$BetweenCentrality, breaks = 50, main = "")
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#--------------- (2) Clustering ---------------
#library(cluster)
# NOTE(review): the next two lines are the old/new versions of the same
# statement (the diff shows both without +/- markers): the new version
# comments out the removal of the case column.
df <- dattree[, -1] # remove case variable
#df <- dattree[, -1] # remove case variable
# TODO: Do I need to scale or does normalization also work?
# Normalize Duration and SearchInfo
# NOTE(review): AvDurItem is not created in the visible dattree
# construction -- presumably added elsewhere in the file; verify.
df$AvDurItem <- normalize(df$AvDurItem)
df$SearchInfo <- normalize(df$SearchInfo)
# Normalize Duration and Numbers
# df$AvDurItem <- normalize(df$AvDurItem)
# df$NumItems <- normalize(df$NumItems)
# df$NumTopic <- normalize(df$NumTopic)
# df$NumPopup <- normalize(df$NumPopup)
# summary(df)
summary(df)
# Look at collinearity
# cor_mat <- cor(df)
# diag(cor_mat) <- NA
# heatmap(cor_mat)
#df <- as.data.frame(scale(dattree[, -1]))
# Collinearity check on the (numeric) feature columns.
cor_mat <- cor(df)
diag(cor_mat) <- NA
heatmap(cor_mat)
#--------------- (2.2) Hierarchical clustering ---------------
# NOTE(review): old/new pair -- the new version (below) computes the Gower
# distance on df instead of dattree[, -1].
dist_mat <- cluster::daisy(dattree[, -1], metric = "gower")
# # "Flatten" with PCA
# mm <- model.matrix( ~ ., df)[, -1] # remove intercept
# tmp <- as.data.frame(lapply(as.data.frame(mm), normalize))
# pc <- prcomp(mm)
# coor_2d <- as.data.frame(pc$x[, 1:2])
# coor_3d <- as.data.frame(pc$x[, 1:3])
dist_mat <- cluster::daisy(df, metric = "gower")
# "Flatten" with MDS
# Classical MDS of the Gower distances into 2D/3D for visualization.
coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2))
coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
# Idea from
# https://stats.stackexchange.com/questions/264912/mds-on-large-dataset-r-or-python
# https://www.inf.uni-konstanz.de/exalgo/software/mdsj/
# Export the full distance matrix for the external MDSJ java tool.
write.table(as.matrix(dist_mat), file = "mds/dist_mat.txt", row.names = FALSE,
col.names = FALSE)
# Run java script
# NOTE(review): external dependency -- requires java and mdsj.jar in the
# working directory; results are read back from mds/.
system("java -jar mdsj.jar -d2 mds/dist_mat.txt mds/mds_coor_2d.txt")
system("java -jar mdsj.jar -d3 mds/dist_mat.txt mds/mds_coor_3d.txt")
coor_2d_java <- read.table("mds/mds_coor_2d.txt", header = FALSE, sep = " ")
plot(coor_2d_java)
# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances?
plot(coor_2d)
rgl::plot3d(coor_3d)
@@ -326,8 +303,10 @@ rgl::plot3d(coor_3d)
# Candidate agglomeration methods, compared by agglomerative coefficient.
method <- c(average = "average", single = "single", complete = "complete",
ward = "ward")
# Overridden: only Ward's method is actually fitted here.
method <- "ward"
hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
# NOTE(review): old/new pair -- pbapply::sapply does not exist (the
# progress-bar variant is pbsapply); the second line is the corrected
# version from the other side of the diff.
acs <- pbapply::sapply(hcs, function(x) x$ac)
acs <- pbapply::pbsapply(hcs, function(x) x$ac)
hc <- hcs$ward
@@ -351,6 +330,11 @@ table(dattree[cluster == 2, "Pattern"])
# Cross-tabulate Pattern / InfocardOnly per cluster assignment.
# NOTE(review): neither `cluster` (the assignment vector) nor the
# InfocardOnly column is created in the visible hunks -- presumably
# defined elsewhere in the file; verify before running.
table(dattree[cluster == 3, "Pattern"])
table(dattree[cluster == 4, "Pattern"])
table(dattree[cluster == 1, "InfocardOnly"])
table(dattree[cluster == 2, "InfocardOnly"])
table(dattree[cluster == 3, "InfocardOnly"])
table(dattree[cluster == 4, "InfocardOnly"])
# Cluster-wise means of all feature columns (requires a `cluster`
# column in df -- TODO confirm it is added upstream).
aggregate(. ~ cluster, df, mean)
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
@@ -397,7 +381,7 @@ write.table(res,
quote = FALSE,
row.names = FALSE)
# Persist intermediate results for later stages.
# NOTE(review): old/new pair -- the second save() line is the updated
# version that additionally stores the MDS coordinates (coor_2d, coor_3d).
save(res, dist_mat, hcs, acs, datcase, dattree,
save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d,
file = "results/haum/tmp_user-navigation.RData")
#--------------- (3) Fit tree ---------------