End of day commit
This commit is contained in:
parent
4eca6c81d6
commit
3cf6c4c51d
@ -166,16 +166,14 @@ normalize <- function(x) {
|
|||||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||||
|
|
||||||
dattree <- data.frame(case = datcase$case,
|
dattree <- data.frame(case = datcase$case,
|
||||||
NumItems = datcase$nitems,
|
PropItems = datcase$nitems / length(unique(dat$item)),
|
||||||
NumTopic = datcase$nopenTopic,
|
SearchInfo = datcase$nopenTopic + datcase$nopenPopup,
|
||||||
NumPopup = datcase$nopenPopup,
|
|
||||||
PropMoves = datcase$nmove / datcase$length,
|
PropMoves = datcase$nmove / datcase$length,
|
||||||
PathLinearity = datcase$nitems / datcase$npaths,
|
PathLinearity = datcase$nitems / datcase$npaths,
|
||||||
Singularity = datcase$npaths / datcase$length
|
Singularity = datcase$npaths / datcase$length
|
||||||
)
|
)
|
||||||
|
|
||||||
dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
|
dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
|
||||||
dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
|
|
||||||
|
|
||||||
get_centrality <- function(case, data) {
|
get_centrality <- function(case, data) {
|
||||||
|
|
||||||
@ -232,7 +230,12 @@ dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
|||||||
dattree$BetweenCentrality > 0.5, "Star",
|
dattree$BetweenCentrality > 0.5, "Star",
|
||||||
dattree$Pattern)
|
dattree$Pattern)
|
||||||
dattree$Pattern <- factor(dattree$Pattern)
|
dattree$Pattern <- factor(dattree$Pattern)
|
||||||
|
# TODO: Get rid of PathLinearity and Singularity as features when I am
|
||||||
|
# using Pattern?
|
||||||
|
|
||||||
|
dattree$PathLinearity <- NULL
|
||||||
|
dattree$Singularity <- NULL
|
||||||
|
dattree$BetweenCentrality <- NULL
|
||||||
|
|
||||||
summary(dattree)
|
summary(dattree)
|
||||||
|
|
||||||
@ -248,6 +251,7 @@ hist(dattree$PathLinearity, breaks = 50, main = "")
|
|||||||
hist(dattree$Singularity, breaks = 50, main = "")
|
hist(dattree$Singularity, breaks = 50, main = "")
|
||||||
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||||
|
|
||||||
|
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
# Remove cases with extreme outliers
|
# Remove cases with extreme outliers
|
||||||
# TODO: Do I want this???
|
# TODO: Do I want this???
|
||||||
|
|
||||||
@ -267,58 +271,31 @@ hist(dattree$PropMoves, breaks = 50, main = "")
|
|||||||
hist(dattree$PathLinearity, breaks = 50, main = "")
|
hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||||
hist(dattree$Singularity, breaks = 50, main = "")
|
hist(dattree$Singularity, breaks = 50, main = "")
|
||||||
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||||
|
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
|
||||||
#--------------- (2) Clustering ---------------
|
#--------------- (2) Clustering ---------------
|
||||||
|
|
||||||
#library(cluster)
|
df <- dattree[, -1] # remove case variable
|
||||||
|
|
||||||
#df <- dattree[, -1] # remove case variable
|
# Normalize Duration and SearchInfo
|
||||||
# TODO: Do I need to scale or does normalization also work?
|
df$AvDurItem <- normalize(df$AvDurItem)
|
||||||
|
df$SearchInfo <- normalize(df$SearchInfo)
|
||||||
|
|
||||||
# Normalize Duration and Numbers
|
summary(df)
|
||||||
# df$AvDurItem <- normalize(df$AvDurItem)
|
|
||||||
# df$NumItems <- normalize(df$NumItems)
|
|
||||||
# df$NumTopic <- normalize(df$NumTopic)
|
|
||||||
# df$NumPopup <- normalize(df$NumPopup)
|
|
||||||
|
|
||||||
# summary(df)
|
|
||||||
|
|
||||||
# Look at collinearity
|
# Look at collinearity
|
||||||
# cor_mat <- cor(df)
|
cor_mat <- cor(df)
|
||||||
# diag(cor_mat) <- NA
|
diag(cor_mat) <- NA
|
||||||
# heatmap(cor_mat)
|
heatmap(cor_mat)
|
||||||
|
|
||||||
#df <- as.data.frame(scale(dattree[, -1]))
|
|
||||||
|
|
||||||
|
|
||||||
#--------------- (2.2) Hierarchical clustering ---------------
|
#--------------- (2.2) Hierarchical clustering ---------------
|
||||||
|
|
||||||
dist_mat <- cluster::daisy(dattree[, -1], metric = "gower")
|
dist_mat <- cluster::daisy(df, metric = "gower")
|
||||||
|
|
||||||
# # "Flatten" with PCA
|
|
||||||
# mm <- model.matrix( ~ ., df)[, -1] # remove intercept
|
|
||||||
# tmp <- as.data.frame(lapply(as.data.frame(mm), normalize))
|
|
||||||
# pc <- prcomp(mm)
|
|
||||||
# coor_2d <- as.data.frame(pc$x[, 1:2])
|
|
||||||
# coor_3d <- as.data.frame(pc$x[, 1:3])
|
|
||||||
|
|
||||||
# "Flatten" with MDS
|
# "Flatten" with MDS
|
||||||
coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2))
|
coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2))
|
||||||
coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
|
coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
|
||||||
|
# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances?
|
||||||
# Idea from
|
|
||||||
# https://stats.stackexchange.com/questions/264912/mds-on-large-dataset-r-or-python
|
|
||||||
# https://www.inf.uni-konstanz.de/exalgo/software/mdsj/
|
|
||||||
write.table(as.matrix(dist_mat), file = "mds/dist_mat.txt", row.names = FALSE,
|
|
||||||
col.names = FALSE)
|
|
||||||
# Run java script
|
|
||||||
system("java -jar mdsj.jar -d2 mds/dist_mat.txt mds/mds_coor_2d.txt")
|
|
||||||
system("java -jar mdsj.jar -d3 mds/dist_mat.txt mds/mds_coor_3d.txt")
|
|
||||||
|
|
||||||
coor_2d_java <- read.table("mds/mds_coor_2d.txt", header = FALSE, sep = " ")
|
|
||||||
plot(coor_2d_java)
|
|
||||||
|
|
||||||
|
|
||||||
plot(coor_2d)
|
plot(coor_2d)
|
||||||
rgl::plot3d(coor_3d)
|
rgl::plot3d(coor_3d)
|
||||||
@ -326,8 +303,10 @@ rgl::plot3d(coor_3d)
|
|||||||
method <- c(average = "average", single = "single", complete = "complete",
|
method <- c(average = "average", single = "single", complete = "complete",
|
||||||
ward = "ward")
|
ward = "ward")
|
||||||
|
|
||||||
|
method <- "ward"
|
||||||
|
|
||||||
hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
|
hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
|
||||||
acs <- pbapply::sapply(hcs, function(x) x$ac)
|
acs <- pbapply::pbsapply(hcs, function(x) x$ac)
|
||||||
|
|
||||||
hc <- hcs$ward
|
hc <- hcs$ward
|
||||||
|
|
||||||
@ -351,6 +330,11 @@ table(dattree[cluster == 2, "Pattern"])
|
|||||||
table(dattree[cluster == 3, "Pattern"])
|
table(dattree[cluster == 3, "Pattern"])
|
||||||
table(dattree[cluster == 4, "Pattern"])
|
table(dattree[cluster == 4, "Pattern"])
|
||||||
|
|
||||||
|
table(dattree[cluster == 1, "InfocardOnly"])
|
||||||
|
table(dattree[cluster == 2, "InfocardOnly"])
|
||||||
|
table(dattree[cluster == 3, "InfocardOnly"])
|
||||||
|
table(dattree[cluster == 4, "InfocardOnly"])
|
||||||
|
|
||||||
aggregate(. ~ cluster, df, mean)
|
aggregate(. ~ cluster, df, mean)
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||||
@ -397,7 +381,7 @@ write.table(res,
|
|||||||
quote = FALSE,
|
quote = FALSE,
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
save(res, dist_mat, hcs, acs, datcase, dattree,
|
save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d,
|
||||||
file = "results/haum/tmp_user-navigation.RData")
|
file = "results/haum/tmp_user-navigation.RData")
|
||||||
|
|
||||||
#--------------- (3) Fit tree ---------------
|
#--------------- (3) Fit tree ---------------
|
||||||
|
Loading…
Reference in New Issue
Block a user