Added partitioning to user navigation script

This commit is contained in:
Nora Wickelmaier 2024-02-27 17:28:01 +01:00
parent 42f12b9256
commit e7eb2cb784

View File

@ -4,13 +4,14 @@
# (1.1) Read log event data # (1.1) Read log event data
# (1.2) Extract additional infos for clustering # (1.2) Extract additional infos for clustering
# (2) Clustering # (2) Clustering
# (3) Fit tree
# (3) Investigate variants # (3) Investigate variants
# #
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv # input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
# output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv # output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv
# results/haum/dattree.csv # results/haum/dattree.csv
# #
# last mod: 2024-02-23 # last mod: 2024-02-27
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
@ -176,10 +177,6 @@ heatmap(cor_mat)
# datcase$duration / mean(datcase$duration) or median(datcase$duration) # datcase$duration / mean(datcase$duration) or median(datcase$duration)
# * Duration per artwork is low: "ave_duration_item" / datcase$duration # * Duration per artwork is low: "ave_duration_item" / datcase$duration
# TODO: Come up with relevant features for navigation behavior
dattree <- data.frame(case = datcase$case, dattree <- data.frame(case = datcase$case,
Duration = datcase$duration, Duration = datcase$duration,
PropItems = datcase$nitems / length(unique(dat$item)), PropItems = datcase$nitems / length(unique(dat$item)),
@ -257,47 +254,15 @@ write.table(dattree,
quote = FALSE, quote = FALSE,
row.names = FALSE) row.names = FALSE)
#--------------- (2) Clustering ---------------
tmp <- dat
tmp$start <- tmp$date.start
tmp$complete <- tmp$date.stop
alog <- activitylog(tmp[tmp$case == 3448, ],
case_id = "case",
activity_id = "item",
resource_id = "path",
timestamps = c("start", "complete"))
process_map(alog)
net <- process_map(alog, render = FALSE)
#DiagrammeR::get_node_df(net)
DiagrammeR::get_node_info(net)
DiagrammeR::get_degree_distribution(net)
DiagrammeR::get_degree_in(net)
DiagrammeR::get_degree_out(net)
DiagrammeR::get_degree_total(net)
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
inet <- DiagrammeR::to_igraph(net)
igraph::centr_degree(inet, loops = FALSE)
igraph::centr_betw(inet)
igraph::centr_clo(inet)
df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")] df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
# TODO: With or without duration? Why is it relevant?
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
# scale Duration and min/max SearchInfo # scale Duration and min/max SearchInfo
df$Duration <- scale(df$Duration) df$Duration <- as.numeric(scale(df$Duration))
df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
(max(df$SearchInfo) - min(df$SearchInfo)) (max(df$SearchInfo) - min(df$SearchInfo))
@ -319,20 +284,20 @@ c5 <- cophenetic(h5)
# Correlations # Correlations
cor(mat, c1) cor(mat, c1)
# 0.9029232 # 0.8854558
cor(mat, c2) cor(mat, c2)
# 0.8879478 # 0.883313
cor(mat, c3) cor(mat, c3)
# 0.5747296 # 0.5368663
cor(mat, c4) cor(mat, c4)
# 0.5994121 # 0.725247
cor(mat, c5) cor(mat, c5)
# 0.5292353 # 0.3895215
# https://en.wikipedia.org/wiki/Cophenetic_correlation # https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h1 hc <- h4
# Something like a scree plot (??) # Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@ -345,118 +310,51 @@ df$grp <- grp
table(grp) table(grp)
fviz_cluster(list(data = df, cluster = grp), fviz_cluster(list(data = df, cluster = grp),
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
# "#000000", "#434F4F"),
ellipse.type = "convex", ellipse.type = "convex",
show.clust.cent = FALSE, ggtheme = theme_bw()) show.clust.cent = FALSE,
ggtheme = theme_bw())
# Look at 3d plot to see if clusters are actually separate
pc <- prcomp(df)
coor <- as.data.frame(pc$x[, c(1, 2, 3)])
coor$grp <- df$grp
rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp])
dattree$grp <- grp
table(dattree[dattree$grp == 1, "Pattern"])
table(dattree[dattree$grp == 2, "Pattern"])
table(dattree[dattree$grp == 3, "Pattern"]) table(dattree[dattree$grp == 3, "Pattern"])
table(dattree[dattree$grp == 4, "Pattern"])
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity, aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
Singularity, centr_degree, centr_degree_loops, Singularity, centr_degree, centr_degree_loops,
centr_between) ~ grp, dattree, mean) centr_between) ~ grp, dattree, mean)
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion, aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
Scholar, Star) ~ grp, df, mean) Scholar, Star) ~ grp, df, mean)
### Look at selected cases ###########################################
tmp <- dat
tmp$start <- tmp$date.start
tmp$complete <- tmp$date.stop
# "We first extract the graph sub-sequences corresponding to the four alog <- activitylog(tmp[tmp$case == 3448, ],
# patterns of Canter et al. (1985). We also identified the number of nodes case_id = "case",
# to which the learner often goes back to (Fig. 4). These nodes are called activity_id = "item",
# “central nodes”. If the number of central nodes is lower than or equal to resource_id = "path",
# half of the sub-sequences, the browsing pattern indicator takes on the timestamps = c("start", "complete"))
# value “Star”." Bousbia et al. (2010)
# I do not know how they got the sub-sequences. I am taking the ratio of process_map(alog)
# strongly connected nodes to weakly connected nodes. If the number of ######################################################################
# weakly connected nodes is twice as high, the pattern is classified as a
# star, i.e., NodeConnect <= 0.5.
# TODO: This does not make sense, smallest and most frequent number is 3!
# (and I do not understand it...)
res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
# count_asymmetric_node_pairs Get the number of asymmetrically-connected node pairs
# count_edges Get a count of all edges
# count_loop_edges Get count of all loop edges
# count_mutual_node_pairs Get the number of mutually-connected node pairs
# count_unconnected_node_pairs Get the number of unconnected node pairs
# count_unconnected_nodes Get count of all unconnected nodes
# TODO: Read up on centrality measures
# https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/
# https://www.datacamp.com/tutorial/centrality-network-analysis-R
# http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf
# https://link.springer.com/article/10.1007/s10618-024-01003-4
#--------------- (2) Clustering ---------------
df <- na.omit(datcase[, c("duration", "distance", "scaleSize",
"rotationDegree", "length", "nmove",
"nitems", "npaths")])
#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")])
mat <- dist(scale(df))
#mat <- dist(df)
h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete")
h3 <- hclust(mat, method = "ward.D")
h4 <- hclust(mat, method = "ward.D2")
h5 <- hclust(mat, method = "single")
# Cophenetic Distances, for each linkage (runs quite some time!)
c1 <- cophenetic(h1)
c2 <- cophenetic(h2)
c3 <- cophenetic(h3)
c4 <- cophenetic(h4)
c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
cor(mat, c2)
cor(mat, c3)
cor(mat, c4)
cor(mat, c5)
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
hc <- h1
# Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
# TODO: Something is wrong
k <- 4
grp <- cutree(hc, k = k)
df$grp <- grp
table(grp)
fviz_cluster(list(data = df, cluster = grp),
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
# "#000000", "#434F4F"),
ellipse.type = "convex",
show.clust.cent = FALSE, ggtheme = theme_bw())
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
nitems, npaths) ~ grp, df, mean)
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
nitems, npaths) ~ grp, df, median)
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
nitems, npaths) ~ grp, df, max)
df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize",
"rotationDegree", "length", "nmove",
"nitems", "npaths")])$case
res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
xtabs( ~ item + grp, res) xtabs( ~ item + grp, res)
@ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res)
vioplot::vioplot(scaleSize ~ grp, res) vioplot::vioplot(scaleSize ~ grp, res)
vioplot::vioplot(rotationDegree ~ grp, res) vioplot::vioplot(rotationDegree ~ grp, res)
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
write.table(res, write.table(res,
file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv", file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
sep = ";", sep = ";",
quote = FALSE, quote = FALSE,
row.names = FALSE) row.names = FALSE)
#--------------- (3) Investigate variants --------------- #--------------- (3) Fit tree ---------------
library(rpart)
library(partykit)
dattree$Duration_scaled <- scale(dattree$Duration)
dattree$grp <- factor(dattree$grp)
dattree$Pattern <- factor(dattree$Pattern)
c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves +
Pattern, data = dattree, method = "class")
plot(as.party(c1))
c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
data = dattree, method = "class")
plot(as.party(c2))
#--------------- (4) Investigate variants ---------------
res$start <- res$date.start res$start <- res$date.start
res$complete <- res$date.stop res$complete <- res$date.stop
@ -558,3 +478,30 @@ for (case in cases) {
} }
########################### TODO: Still need it?
net <- process_map(alog, render = FALSE)
#DiagrammeR::get_node_df(net)
DiagrammeR::get_node_info(net)
DiagrammeR::get_degree_distribution(net)
DiagrammeR::get_degree_in(net)
DiagrammeR::get_degree_out(net)
DiagrammeR::get_degree_total(net)
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
inet <- DiagrammeR::to_igraph(net)
igraph::centr_degree(inet, loops = FALSE)
igraph::centr_betw(inet)
igraph::centr_clo(inet)