End of day commit
This commit is contained in:
parent
4eca6c81d6
commit
3cf6c4c51d
@ -166,16 +166,14 @@ normalize <- function(x) {
|
||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||
|
||||
dattree <- data.frame(case = datcase$case,
|
||||
NumItems = datcase$nitems,
|
||||
NumTopic = datcase$nopenTopic,
|
||||
NumPopup = datcase$nopenPopup,
|
||||
PropItems = datcase$nitems / length(unique(dat$item)),
|
||||
SearchInfo = datcase$nopenTopic + datcase$nopenPopup,
|
||||
PropMoves = datcase$nmove / datcase$length,
|
||||
PathLinearity = datcase$nitems / datcase$npaths,
|
||||
Singularity = datcase$npaths / datcase$length
|
||||
)
|
||||
|
||||
dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
|
||||
dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
|
||||
dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
|
||||
|
||||
get_centrality <- function(case, data) {
|
||||
|
||||
@ -232,7 +230,12 @@ dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
||||
dattree$BetweenCentrality > 0.5, "Star",
|
||||
dattree$Pattern)
|
||||
dattree$Pattern <- factor(dattree$Pattern)
|
||||
# TODO: Get rid of PathLinearity and Singularity as features when I am
|
||||
# using Pattern?
|
||||
|
||||
dattree$PathLinearity <- NULL
|
||||
dattree$Singularity <- NULL
|
||||
dattree$BetweenCentrality <- NULL
|
||||
|
||||
summary(dattree)
|
||||
|
||||
@ -248,6 +251,7 @@ hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||
hist(dattree$Singularity, breaks = 50, main = "")
|
||||
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||
|
||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
# Remove cases with extreme outliers
|
||||
# TODO: Do I want this???
|
||||
|
||||
@ -267,58 +271,31 @@ hist(dattree$PropMoves, breaks = 50, main = "")
|
||||
hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||
hist(dattree$Singularity, breaks = 50, main = "")
|
||||
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||
|
||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
#--------------- (2) Clustering ---------------
|
||||
|
||||
#library(cluster)
|
||||
df <- dattree[, -1] # remove case variable
|
||||
|
||||
#df <- dattree[, -1] # remove case variable
|
||||
# TODO: Do I need to scale or does normalization also work?
|
||||
# Normalize Duration and SearchInfo
|
||||
df$AvDurItem <- normalize(df$AvDurItem)
|
||||
df$SearchInfo <- normalize(df$SearchInfo)
|
||||
|
||||
# Normalize Duration and Numbers
|
||||
# df$AvDurItem <- normalize(df$AvDurItem)
|
||||
# df$NumItems <- normalize(df$NumItems)
|
||||
# df$NumTopic <- normalize(df$NumTopic)
|
||||
# df$NumPopup <- normalize(df$NumPopup)
|
||||
|
||||
# summary(df)
|
||||
summary(df)
|
||||
|
||||
# Look at collinearity
|
||||
# cor_mat <- cor(df)
|
||||
# diag(cor_mat) <- NA
|
||||
# heatmap(cor_mat)
|
||||
|
||||
#df <- as.data.frame(scale(dattree[, -1]))
|
||||
|
||||
cor_mat <- cor(df)
|
||||
diag(cor_mat) <- NA
|
||||
heatmap(cor_mat)
|
||||
|
||||
#--------------- (2.2) Hierarchical clustering ---------------
|
||||
|
||||
dist_mat <- cluster::daisy(dattree[, -1], metric = "gower")
|
||||
|
||||
# # "Flatten" with PCA
|
||||
# mm <- model.matrix( ~ ., df)[, -1] # remove intercept
|
||||
# tmp <- as.data.frame(lapply(as.data.frame(mm), normalize))
|
||||
# pc <- prcomp(mm)
|
||||
# coor_2d <- as.data.frame(pc$x[, 1:2])
|
||||
# coor_3d <- as.data.frame(pc$x[, 1:3])
|
||||
dist_mat <- cluster::daisy(df, metric = "gower")
|
||||
|
||||
# "Flatten" with MDS
|
||||
coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2))
|
||||
coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
|
||||
|
||||
# Idea from
|
||||
# https://stats.stackexchange.com/questions/264912/mds-on-large-dataset-r-or-python
|
||||
# https://www.inf.uni-konstanz.de/exalgo/software/mdsj/
|
||||
write.table(as.matrix(dist_mat), file = "mds/dist_mat.txt", row.names = FALSE,
|
||||
col.names = FALSE)
|
||||
# Run java script
|
||||
system("java -jar mdsj.jar -d2 mds/dist_mat.txt mds/mds_coor_2d.txt")
|
||||
system("java -jar mdsj.jar -d3 mds/dist_mat.txt mds/mds_coor_3d.txt")
|
||||
|
||||
coor_2d_java <- read.table("mds/mds_coor_2d.txt", header = FALSE, sep = " ")
|
||||
plot(coor_2d_java)
|
||||
|
||||
# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances?
|
||||
|
||||
plot(coor_2d)
|
||||
rgl::plot3d(coor_3d)
|
||||
@ -326,8 +303,10 @@ rgl::plot3d(coor_3d)
|
||||
method <- c(average = "average", single = "single", complete = "complete",
|
||||
ward = "ward")
|
||||
|
||||
method <- "ward"
|
||||
|
||||
hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
|
||||
acs <- pbapply::sapply(hcs, function(x) x$ac)
|
||||
acs <- pbapply::pbsapply(hcs, function(x) x$ac)
|
||||
|
||||
hc <- hcs$ward
|
||||
|
||||
@ -351,6 +330,11 @@ table(dattree[cluster == 2, "Pattern"])
|
||||
table(dattree[cluster == 3, "Pattern"])
|
||||
table(dattree[cluster == 4, "Pattern"])
|
||||
|
||||
table(dattree[cluster == 1, "InfocardOnly"])
|
||||
table(dattree[cluster == 2, "InfocardOnly"])
|
||||
table(dattree[cluster == 3, "InfocardOnly"])
|
||||
table(dattree[cluster == 4, "InfocardOnly"])
|
||||
|
||||
aggregate(. ~ cluster, df, mean)
|
||||
|
||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||
@ -397,7 +381,7 @@ write.table(res,
|
||||
quote = FALSE,
|
||||
row.names = FALSE)
|
||||
|
||||
save(res, dist_mat, hcs, acs, datcase, dattree,
|
||||
save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d,
|
||||
file = "results/haum/tmp_user-navigation.RData")
|
||||
|
||||
#--------------- (3) Fit tree ---------------
|
||||
|
Loading…
Reference in New Issue
Block a user