Added partitioning to user navigation script

This commit is contained in:
Nora Wickelmaier 2024-02-27 17:28:01 +01:00
parent 42f12b9256
commit e7eb2cb784

View File

@ -4,13 +4,14 @@
# (1.1) Read log event data
# (1.2) Extract additional infos for clustering
# (2) Clustering
# (3) Fit tree
# (3) Investigate variants
#
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
# output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv
# results/haum/dattree.csv
#
# last mod: 2024-02-23
# last mod: 2024-02-27
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
@ -176,10 +177,6 @@ heatmap(cor_mat)
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
# TODO: Come up with relevant features for navigation behavior
dattree <- data.frame(case = datcase$case,
Duration = datcase$duration,
PropItems = datcase$nitems / length(unique(dat$item)),
@ -257,47 +254,15 @@ write.table(dattree,
quote = FALSE,
row.names = FALSE)
tmp <- dat
tmp$start <- tmp$date.start
tmp$complete <- tmp$date.stop
alog <- activitylog(tmp[tmp$case == 3448, ],
case_id = "case",
activity_id = "item",
resource_id = "path",
timestamps = c("start", "complete"))
process_map(alog)
net <- process_map(alog, render = FALSE)
#DiagrammeR::get_node_df(net)
DiagrammeR::get_node_info(net)
DiagrammeR::get_degree_distribution(net)
DiagrammeR::get_degree_in(net)
DiagrammeR::get_degree_out(net)
DiagrammeR::get_degree_total(net)
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
inet <- DiagrammeR::to_igraph(net)
igraph::centr_degree(inet, loops = FALSE)
igraph::centr_betw(inet)
igraph::centr_clo(inet)
#--------------- (2) Clustering ---------------
df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
# TODO: With or without duration? Why is it relevant?
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
# scale Duration and min/max SearchInfo
df$Duration <- scale(df$Duration)
df$Duration <- as.numeric(scale(df$Duration))
df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
(max(df$SearchInfo) - min(df$SearchInfo))
@ -319,20 +284,20 @@ c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
# 0.9029232
# 0.8854558
cor(mat, c2)
# 0.8879478
# 0.883313
cor(mat, c3)
# 0.5747296
# 0.5368663
cor(mat, c4)
# 0.5994121
# 0.725247
cor(mat, c5)
# 0.5292353
# 0.3895215
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h1
hc <- h4
# Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@ -345,118 +310,51 @@ df$grp <- grp
table(grp)
fviz_cluster(list(data = df, cluster = grp),
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
# "#000000", "#434F4F"),
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
ellipse.type = "convex",
show.clust.cent = FALSE, ggtheme = theme_bw())
show.clust.cent = FALSE,
ggtheme = theme_bw())
# Look at 3d plot to see if clusters are actually separate
pc <- prcomp(df)
coor <- as.data.frame(pc$x[, c(1, 2, 3)])
coor$grp <- df$grp
rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp])
dattree$grp <- grp
table(dattree[dattree$grp == 1, "Pattern"])
table(dattree[dattree$grp == 2, "Pattern"])
table(dattree[dattree$grp == 3, "Pattern"])
table(dattree[dattree$grp == 4, "Pattern"])
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
Singularity, centr_degree, centr_degree_loops,
centr_between) ~ grp, dattree, mean)
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
Scholar, Star) ~ grp, df, mean)
### Look at selected cases ###########################################
tmp <- dat
tmp$start <- tmp$date.start
tmp$complete <- tmp$date.stop
# "We first extract the graph sub-sequences corresponding to the four
# patterns of Canter et al. (1985). We also identified the number of nodes
# to which the learner often goes back to (Fig. 4). These nodes are called
# “central nodes”. If the number of central nodes is lower than or equal to
# half of the sub-sequences, the browsing pattern indicator takes on the
# value “Star”." Bousbia et al. (2010)
alog <- activitylog(tmp[tmp$case == 3448, ],
case_id = "case",
activity_id = "item",
resource_id = "path",
timestamps = c("start", "complete"))
# I do not know how they got the sub-sequences. I am taking the ratio of
# strongly connected nodes to weakly connected nodes. If the number of
# weakly connected nodes is twice as high, the pattern is classified as a
# star, i.e., NodeConnect <= 0.5.
# TODO: This does not make sense, smallest and most frequent number is 3!
# (and I do not understand it...)
process_map(alog)
######################################################################
# count_asymmetric_node_pairs Get the number of asymmetrically-connected node pairs
# count_edges Get a count of all edges
# count_loop_edges Get count of all loop edges
# count_mutual_node_pairs Get the number of mutually-connected node pairs
# count_unconnected_node_pairs Get the number of unconnected node pairs
# count_unconnected_nodes Get count of all unconnected nodes
# TODO: Read up on centrality measures
# https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/
# https://www.datacamp.com/tutorial/centrality-network-analysis-R
# http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf
# https://link.springer.com/article/10.1007/s10618-024-01003-4
#--------------- (2) Clustering ---------------
df <- na.omit(datcase[, c("duration", "distance", "scaleSize",
"rotationDegree", "length", "nmove",
"nitems", "npaths")])
#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")])
mat <- dist(scale(df))
#mat <- dist(df)
h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete")
h3 <- hclust(mat, method = "ward.D")
h4 <- hclust(mat, method = "ward.D2")
h5 <- hclust(mat, method = "single")
# Cophenetic Distances, for each linkage (runs quite some time!)
c1 <- cophenetic(h1)
c2 <- cophenetic(h2)
c3 <- cophenetic(h3)
c4 <- cophenetic(h4)
c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
cor(mat, c2)
cor(mat, c3)
cor(mat, c4)
cor(mat, c5)
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h1
# Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
# TODO: Something is wrong
k <- 4
grp <- cutree(hc, k = k)
df$grp <- grp
table(grp)
fviz_cluster(list(data = df, cluster = grp),
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
# "#000000", "#434F4F"),
ellipse.type = "convex",
show.clust.cent = FALSE, ggtheme = theme_bw())
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
nitems, npaths) ~ grp, df, mean)
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
nitems, npaths) ~ grp, df, median)
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
nitems, npaths) ~ grp, df, max)
df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize",
"rotationDegree", "length", "nmove",
"nitems", "npaths")])$case
res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE)
res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
xtabs( ~ item + grp, res)
@ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res)
vioplot::vioplot(scaleSize ~ grp, res)
vioplot::vioplot(rotationDegree ~ grp, res)
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
write.table(res,
file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
sep = ";",
quote = FALSE,
row.names = FALSE)
#--------------- (3) Investigate variants ---------------
#--------------- (3) Fit tree ---------------
library(rpart)
library(partykit)
dattree$Duration_scaled <- scale(dattree$Duration)
dattree$grp <- factor(dattree$grp)
dattree$Pattern <- factor(dattree$Pattern)
c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves +
Pattern, data = dattree, method = "class")
plot(as.party(c1))
c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
data = dattree, method = "class")
plot(as.party(c2))
#--------------- (4) Investigate variants ---------------
res$start <- res$date.start
res$complete <- res$date.stop
@ -558,3 +478,30 @@ for (case in cases) {
}
########################### TODO: Still need it?
net <- process_map(alog, render = FALSE)
#DiagrammeR::get_node_df(net)
DiagrammeR::get_node_info(net)
DiagrammeR::get_degree_distribution(net)
DiagrammeR::get_degree_in(net)
DiagrammeR::get_degree_out(net)
DiagrammeR::get_degree_total(net)
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
inet <- DiagrammeR::to_igraph(net)
igraph::centr_degree(inet, loops = FALSE)
igraph::centr_betw(inet)
igraph::centr_clo(inet)