Added partitioning to user navigation script
This commit is contained in:
parent
42f12b9256
commit
e7eb2cb784
@ -4,13 +4,14 @@
|
||||
# (1.1) Read log event data
|
||||
# (1.2) Extract additional infos for clustering
|
||||
# (2) Clustering
|
||||
# (3) Fit tree
|
||||
# (3) Investigate variants
|
||||
#
|
||||
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
|
||||
# output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv
|
||||
# results/haum/dattree.csv
|
||||
#
|
||||
# last mod: 2024-02-23
|
||||
# last mod: 2024-02-27
|
||||
|
||||
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
||||
@ -176,10 +177,6 @@ heatmap(cor_mat)
|
||||
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
|
||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||
|
||||
# TODO: Come up with relevant features for navigation behavior
|
||||
|
||||
|
||||
|
||||
dattree <- data.frame(case = datcase$case,
|
||||
Duration = datcase$duration,
|
||||
PropItems = datcase$nitems / length(unique(dat$item)),
|
||||
@ -257,47 +254,15 @@ write.table(dattree,
|
||||
quote = FALSE,
|
||||
row.names = FALSE)
|
||||
|
||||
|
||||
tmp <- dat
|
||||
tmp$start <- tmp$date.start
|
||||
tmp$complete <- tmp$date.stop
|
||||
|
||||
|
||||
alog <- activitylog(tmp[tmp$case == 3448, ],
|
||||
case_id = "case",
|
||||
activity_id = "item",
|
||||
resource_id = "path",
|
||||
timestamps = c("start", "complete"))
|
||||
|
||||
process_map(alog)
|
||||
|
||||
net <- process_map(alog, render = FALSE)
|
||||
#DiagrammeR::get_node_df(net)
|
||||
|
||||
DiagrammeR::get_node_info(net)
|
||||
|
||||
DiagrammeR::get_degree_distribution(net)
|
||||
|
||||
DiagrammeR::get_degree_in(net)
|
||||
DiagrammeR::get_degree_out(net)
|
||||
DiagrammeR::get_degree_total(net)
|
||||
|
||||
|
||||
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
|
||||
|
||||
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
|
||||
|
||||
inet <- DiagrammeR::to_igraph(net)
|
||||
igraph::centr_degree(inet, loops = FALSE)
|
||||
igraph::centr_betw(inet)
|
||||
igraph::centr_clo(inet)
|
||||
#--------------- (2) Clustering ---------------
|
||||
|
||||
df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
|
||||
# TODO: With or without duration? Why is it relevant?
|
||||
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
|
||||
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
|
||||
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
|
||||
# scale Duration and min/max SearchInfo
|
||||
df$Duration <- scale(df$Duration)
|
||||
df$Duration <- as.numeric(scale(df$Duration))
|
||||
df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
|
||||
(max(df$SearchInfo) - min(df$SearchInfo))
|
||||
|
||||
@ -319,20 +284,20 @@ c5 <- cophenetic(h5)
|
||||
|
||||
# Correlations
|
||||
cor(mat, c1)
|
||||
# 0.9029232
|
||||
# 0.8854558
|
||||
cor(mat, c2)
|
||||
# 0.8879478
|
||||
# 0.883313
|
||||
cor(mat, c3)
|
||||
# 0.5747296
|
||||
# 0.5368663
|
||||
cor(mat, c4)
|
||||
# 0.5994121
|
||||
# 0.725247
|
||||
cor(mat, c5)
|
||||
# 0.5292353
|
||||
# 0.3895215
|
||||
|
||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||
|
||||
hc <- h1
|
||||
hc <- h4
|
||||
|
||||
# Something like a scree plot (??)
|
||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||
@ -345,118 +310,51 @@ df$grp <- grp
|
||||
table(grp)
|
||||
|
||||
fviz_cluster(list(data = df, cluster = grp),
|
||||
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
|
||||
# "#000000", "#434F4F"),
|
||||
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE, ggtheme = theme_bw())
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
# Look at 3d plot to see if clusters are actually separate
|
||||
pc <- prcomp(df)
|
||||
coor <- as.data.frame(pc$x[, c(1, 2, 3)])
|
||||
coor$grp <- df$grp
|
||||
rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
dattree$grp <- grp
|
||||
table(dattree[dattree$grp == 1, "Pattern"])
|
||||
table(dattree[dattree$grp == 2, "Pattern"])
|
||||
table(dattree[dattree$grp == 3, "Pattern"])
|
||||
table(dattree[dattree$grp == 4, "Pattern"])
|
||||
|
||||
|
||||
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
|
||||
Singularity, centr_degree, centr_degree_loops,
|
||||
centr_between) ~ grp, dattree, mean)
|
||||
|
||||
|
||||
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
|
||||
Scholar, Star) ~ grp, df, mean)
|
||||
|
||||
### Look at selected cases ###########################################
|
||||
tmp <- dat
|
||||
tmp$start <- tmp$date.start
|
||||
tmp$complete <- tmp$date.stop
|
||||
|
||||
# "We first extract the graph sub-sequences corresponding to the four
|
||||
# patterns of Canter et al. (1985). We also identified the number of nodes
|
||||
# to which the learner often goes back to (Fig. 4). These nodes are called
|
||||
# “central nodes”. If the number of central nodes is lower than or equal to
|
||||
# half of the sub-sequences, the browsing pattern indicator takes on the
|
||||
# value “Star”." Bousbia et al. (2010)
|
||||
alog <- activitylog(tmp[tmp$case == 3448, ],
|
||||
case_id = "case",
|
||||
activity_id = "item",
|
||||
resource_id = "path",
|
||||
timestamps = c("start", "complete"))
|
||||
|
||||
# I do not know how they got the sub-sequences. I am taking the ratio of
|
||||
# strongly connected nodes to weakly connected nodes. If the number of
|
||||
# weakly connected nodes is twice as high, the pattern is classified as a
|
||||
# star, i.e., NodeConnect <= 0.5.
|
||||
# TODO: This does not make sense, smallest and most frequent number is 3!
|
||||
# (and I do not understand it...)
|
||||
process_map(alog)
|
||||
######################################################################
|
||||
|
||||
|
||||
# count_asymmetric_node_pairs Get the number of asymmetrically-connected node pairs
|
||||
# count_edges Get a count of all edges
|
||||
# count_loop_edges Get count of all loop edges
|
||||
# count_mutual_node_pairs Get the number of mutually-connected node pairs
|
||||
# count_unconnected_node_pairs Get the number of unconnected node pairs
|
||||
# count_unconnected_nodes Get count of all unconnected nodes
|
||||
|
||||
|
||||
# TODO: Read up on centrality measures
|
||||
# https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/
|
||||
# https://www.datacamp.com/tutorial/centrality-network-analysis-R
|
||||
# http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf
|
||||
# https://link.springer.com/article/10.1007/s10618-024-01003-4
|
||||
|
||||
#--------------- (2) Clustering ---------------
|
||||
|
||||
df <- na.omit(datcase[, c("duration", "distance", "scaleSize",
|
||||
"rotationDegree", "length", "nmove",
|
||||
"nitems", "npaths")])
|
||||
|
||||
#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")])
|
||||
mat <- dist(scale(df))
|
||||
#mat <- dist(df)
|
||||
|
||||
h1 <- hclust(mat, method = "average")
|
||||
h2 <- hclust(mat, method = "complete")
|
||||
h3 <- hclust(mat, method = "ward.D")
|
||||
h4 <- hclust(mat, method = "ward.D2")
|
||||
h5 <- hclust(mat, method = "single")
|
||||
|
||||
# Cophenetic Distances, for each linkage (runs quite some time!)
|
||||
c1 <- cophenetic(h1)
|
||||
c2 <- cophenetic(h2)
|
||||
c3 <- cophenetic(h3)
|
||||
c4 <- cophenetic(h4)
|
||||
c5 <- cophenetic(h5)
|
||||
|
||||
# Correlations
|
||||
cor(mat, c1)
|
||||
cor(mat, c2)
|
||||
cor(mat, c3)
|
||||
cor(mat, c4)
|
||||
cor(mat, c5)
|
||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||
|
||||
hc <- h1
|
||||
|
||||
# Something like a scree plot (??)
|
||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||
|
||||
|
||||
# TODO: Something is wrong
|
||||
|
||||
k <- 4
|
||||
|
||||
grp <- cutree(hc, k = k)
|
||||
df$grp <- grp
|
||||
|
||||
table(grp)
|
||||
|
||||
fviz_cluster(list(data = df, cluster = grp),
|
||||
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
|
||||
# "#000000", "#434F4F"),
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE, ggtheme = theme_bw())
|
||||
|
||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
|
||||
nitems, npaths) ~ grp, df, mean)
|
||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
|
||||
nitems, npaths) ~ grp, df, median)
|
||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
|
||||
nitems, npaths) ~ grp, df, max)
|
||||
|
||||
|
||||
df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize",
|
||||
"rotationDegree", "length", "nmove",
|
||||
"nitems", "npaths")])$case
|
||||
|
||||
res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE)
|
||||
res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
|
||||
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
||||
|
||||
xtabs( ~ item + grp, res)
|
||||
@ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res)
|
||||
vioplot::vioplot(scaleSize ~ grp, res)
|
||||
vioplot::vioplot(rotationDegree ~ grp, res)
|
||||
|
||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
|
||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
|
||||
|
||||
write.table(res,
|
||||
file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
|
||||
sep = ";",
|
||||
quote = FALSE,
|
||||
row.names = FALSE)
|
||||
|
||||
#--------------- (3) Investigate variants ---------------
|
||||
#--------------- (3) Fit tree ---------------
|
||||
|
||||
library(rpart)
|
||||
library(partykit)
|
||||
|
||||
dattree$Duration_scaled <- scale(dattree$Duration)
|
||||
dattree$grp <- factor(dattree$grp)
|
||||
dattree$Pattern <- factor(dattree$Pattern)
|
||||
|
||||
c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves +
|
||||
Pattern, data = dattree, method = "class")
|
||||
|
||||
plot(as.party(c1))
|
||||
|
||||
c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
|
||||
data = dattree, method = "class")
|
||||
|
||||
plot(as.party(c2))
|
||||
|
||||
#--------------- (4) Investigate variants ---------------
|
||||
|
||||
res$start <- res$date.start
|
||||
res$complete <- res$date.stop
|
||||
@ -558,3 +478,30 @@ for (case in cases) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
########################### TODO: Still need it?
|
||||
|
||||
|
||||
net <- process_map(alog, render = FALSE)
|
||||
#DiagrammeR::get_node_df(net)
|
||||
|
||||
DiagrammeR::get_node_info(net)
|
||||
|
||||
DiagrammeR::get_degree_distribution(net)
|
||||
|
||||
DiagrammeR::get_degree_in(net)
|
||||
DiagrammeR::get_degree_out(net)
|
||||
DiagrammeR::get_degree_total(net)
|
||||
|
||||
|
||||
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
|
||||
|
||||
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
|
||||
|
||||
inet <- DiagrammeR::to_igraph(net)
|
||||
igraph::centr_degree(inet, loops = FALSE)
|
||||
igraph::centr_betw(inet)
|
||||
igraph::centr_clo(inet)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user