Added partitioning to user navigation script
This commit is contained in:
parent
42f12b9256
commit
e7eb2cb784
@ -4,13 +4,14 @@
|
|||||||
# (1.1) Read log event data
|
# (1.1) Read log event data
|
||||||
# (1.2) Extract additional infos for clustering
|
# (1.2) Extract additional infos for clustering
|
||||||
# (2) Clustering
|
# (2) Clustering
|
||||||
|
# (3) Fit tree
|
||||||
# (3) Investigate variants
|
# (3) Investigate variants
|
||||||
#
|
#
|
||||||
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
|
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
|
||||||
# output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv
|
# output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv
|
||||||
# results/haum/dattree.csv
|
# results/haum/dattree.csv
|
||||||
#
|
#
|
||||||
# last mod: 2024-02-23
|
# last mod: 2024-02-27
|
||||||
|
|
||||||
|
|
||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
||||||
@ -176,10 +177,6 @@ heatmap(cor_mat)
|
|||||||
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
|
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
|
||||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||||
|
|
||||||
# TODO: Come up with relevant features for navigation behavior
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
dattree <- data.frame(case = datcase$case,
|
dattree <- data.frame(case = datcase$case,
|
||||||
Duration = datcase$duration,
|
Duration = datcase$duration,
|
||||||
PropItems = datcase$nitems / length(unique(dat$item)),
|
PropItems = datcase$nitems / length(unique(dat$item)),
|
||||||
@ -257,47 +254,15 @@ write.table(dattree,
|
|||||||
quote = FALSE,
|
quote = FALSE,
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
|
#--------------- (2) Clustering ---------------
|
||||||
tmp <- dat
|
|
||||||
tmp$start <- tmp$date.start
|
|
||||||
tmp$complete <- tmp$date.stop
|
|
||||||
|
|
||||||
|
|
||||||
alog <- activitylog(tmp[tmp$case == 3448, ],
|
|
||||||
case_id = "case",
|
|
||||||
activity_id = "item",
|
|
||||||
resource_id = "path",
|
|
||||||
timestamps = c("start", "complete"))
|
|
||||||
|
|
||||||
process_map(alog)
|
|
||||||
|
|
||||||
net <- process_map(alog, render = FALSE)
|
|
||||||
#DiagrammeR::get_node_df(net)
|
|
||||||
|
|
||||||
DiagrammeR::get_node_info(net)
|
|
||||||
|
|
||||||
DiagrammeR::get_degree_distribution(net)
|
|
||||||
|
|
||||||
DiagrammeR::get_degree_in(net)
|
|
||||||
DiagrammeR::get_degree_out(net)
|
|
||||||
DiagrammeR::get_degree_total(net)
|
|
||||||
|
|
||||||
|
|
||||||
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
|
|
||||||
|
|
||||||
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
|
|
||||||
|
|
||||||
inet <- DiagrammeR::to_igraph(net)
|
|
||||||
igraph::centr_degree(inet, loops = FALSE)
|
|
||||||
igraph::centr_betw(inet)
|
|
||||||
igraph::centr_clo(inet)
|
|
||||||
|
|
||||||
df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
|
df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
|
||||||
|
# TODO: With or without duration? Why is it relevant?
|
||||||
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
|
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
|
||||||
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
|
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
|
||||||
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
|
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
|
||||||
# scale Duration and min/max SearchInfo
|
# scale Duration and min/max SearchInfo
|
||||||
df$Duration <- scale(df$Duration)
|
df$Duration <- as.numeric(scale(df$Duration))
|
||||||
df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
|
df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
|
||||||
(max(df$SearchInfo) - min(df$SearchInfo))
|
(max(df$SearchInfo) - min(df$SearchInfo))
|
||||||
|
|
||||||
@ -319,20 +284,20 @@ c5 <- cophenetic(h5)
|
|||||||
|
|
||||||
# Correlations
|
# Correlations
|
||||||
cor(mat, c1)
|
cor(mat, c1)
|
||||||
# 0.9029232
|
# 0.8854558
|
||||||
cor(mat, c2)
|
cor(mat, c2)
|
||||||
# 0.8879478
|
# 0.883313
|
||||||
cor(mat, c3)
|
cor(mat, c3)
|
||||||
# 0.5747296
|
# 0.5368663
|
||||||
cor(mat, c4)
|
cor(mat, c4)
|
||||||
# 0.5994121
|
# 0.725247
|
||||||
cor(mat, c5)
|
cor(mat, c5)
|
||||||
# 0.5292353
|
# 0.3895215
|
||||||
|
|
||||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||||
|
|
||||||
hc <- h1
|
hc <- h4
|
||||||
|
|
||||||
# Something like a scree plot (??)
|
# Something like a scree plot (??)
|
||||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||||
@ -345,118 +310,51 @@ df$grp <- grp
|
|||||||
table(grp)
|
table(grp)
|
||||||
|
|
||||||
fviz_cluster(list(data = df, cluster = grp),
|
fviz_cluster(list(data = df, cluster = grp),
|
||||||
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
|
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
|
||||||
# "#000000", "#434F4F"),
|
|
||||||
ellipse.type = "convex",
|
ellipse.type = "convex",
|
||||||
show.clust.cent = FALSE, ggtheme = theme_bw())
|
show.clust.cent = FALSE,
|
||||||
|
ggtheme = theme_bw())
|
||||||
|
|
||||||
|
# Look at 3d plot to see if clusters are actually separate
|
||||||
|
pc <- prcomp(df)
|
||||||
|
coor <- as.data.frame(pc$x[, c(1, 2, 3)])
|
||||||
|
coor$grp <- df$grp
|
||||||
|
rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
dattree$grp <- grp
|
||||||
|
table(dattree[dattree$grp == 1, "Pattern"])
|
||||||
|
table(dattree[dattree$grp == 2, "Pattern"])
|
||||||
table(dattree[dattree$grp == 3, "Pattern"])
|
table(dattree[dattree$grp == 3, "Pattern"])
|
||||||
|
table(dattree[dattree$grp == 4, "Pattern"])
|
||||||
|
|
||||||
|
|
||||||
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
|
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
|
||||||
Singularity, centr_degree, centr_degree_loops,
|
Singularity, centr_degree, centr_degree_loops,
|
||||||
centr_between) ~ grp, dattree, mean)
|
centr_between) ~ grp, dattree, mean)
|
||||||
|
|
||||||
|
|
||||||
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
|
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
|
||||||
Scholar, Star) ~ grp, df, mean)
|
Scholar, Star) ~ grp, df, mean)
|
||||||
|
|
||||||
|
### Look at selected cases ###########################################
|
||||||
|
tmp <- dat
|
||||||
|
tmp$start <- tmp$date.start
|
||||||
|
tmp$complete <- tmp$date.stop
|
||||||
|
|
||||||
# "We first extract the graph sub-sequences corresponding to the four
|
alog <- activitylog(tmp[tmp$case == 3448, ],
|
||||||
# patterns of Canter et al. (1985). We also identified the number of nodes
|
case_id = "case",
|
||||||
# to which the learner often goes back to (Fig. 4). These nodes are called
|
activity_id = "item",
|
||||||
# “central nodes”. If the number of central nodes is lower than or equal to
|
resource_id = "path",
|
||||||
# half of the sub-sequences, the browsing pattern indicator takes on the
|
timestamps = c("start", "complete"))
|
||||||
# value “Star”." Bousbia et al. (2010)
|
|
||||||
|
|
||||||
# I do not know how they got the sub-sequences. I am taking the ratio of
|
process_map(alog)
|
||||||
# strongly connected nodes to weakly connected nodes. If the number of
|
######################################################################
|
||||||
# weakly connected nodes is twice as high, the pattern is classified as a
|
|
||||||
# star, i.e., NodeConnect <= 0.5.
|
|
||||||
# TODO: This does not make sense, smallest and most frequent number is 3!
|
|
||||||
# (and I do not understand it...)
|
|
||||||
|
|
||||||
|
res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
|
||||||
# count_asymmetric_node_pairs Get the number of asymmetrically-connected node pairs
|
|
||||||
# count_edges Get a count of all edges
|
|
||||||
# count_loop_edges Get count of all loop edges
|
|
||||||
# count_mutual_node_pairs Get the number of mutually-connected node pairs
|
|
||||||
# count_unconnected_node_pairs Get the number of unconnected node pairs
|
|
||||||
# count_unconnected_nodes Get count of all unconnected nodes
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: Read up on centrality measures
|
|
||||||
# https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/
|
|
||||||
# https://www.datacamp.com/tutorial/centrality-network-analysis-R
|
|
||||||
# http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf
|
|
||||||
# https://link.springer.com/article/10.1007/s10618-024-01003-4
|
|
||||||
|
|
||||||
#--------------- (2) Clustering ---------------
|
|
||||||
|
|
||||||
df <- na.omit(datcase[, c("duration", "distance", "scaleSize",
|
|
||||||
"rotationDegree", "length", "nmove",
|
|
||||||
"nitems", "npaths")])
|
|
||||||
|
|
||||||
#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")])
|
|
||||||
mat <- dist(scale(df))
|
|
||||||
#mat <- dist(df)
|
|
||||||
|
|
||||||
h1 <- hclust(mat, method = "average")
|
|
||||||
h2 <- hclust(mat, method = "complete")
|
|
||||||
h3 <- hclust(mat, method = "ward.D")
|
|
||||||
h4 <- hclust(mat, method = "ward.D2")
|
|
||||||
h5 <- hclust(mat, method = "single")
|
|
||||||
|
|
||||||
# Cophenetic Distances, for each linkage (runs quite some time!)
|
|
||||||
c1 <- cophenetic(h1)
|
|
||||||
c2 <- cophenetic(h2)
|
|
||||||
c3 <- cophenetic(h3)
|
|
||||||
c4 <- cophenetic(h4)
|
|
||||||
c5 <- cophenetic(h5)
|
|
||||||
|
|
||||||
# Correlations
|
|
||||||
cor(mat, c1)
|
|
||||||
cor(mat, c2)
|
|
||||||
cor(mat, c3)
|
|
||||||
cor(mat, c4)
|
|
||||||
cor(mat, c5)
|
|
||||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
|
||||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
|
||||||
|
|
||||||
hc <- h1
|
|
||||||
|
|
||||||
# Something like a scree plot (??)
|
|
||||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: Something is wrong
|
|
||||||
|
|
||||||
k <- 4
|
|
||||||
|
|
||||||
grp <- cutree(hc, k = k)
|
|
||||||
df$grp <- grp
|
|
||||||
|
|
||||||
table(grp)
|
|
||||||
|
|
||||||
fviz_cluster(list(data = df, cluster = grp),
|
|
||||||
#palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
|
|
||||||
# "#000000", "#434F4F"),
|
|
||||||
ellipse.type = "convex",
|
|
||||||
show.clust.cent = FALSE, ggtheme = theme_bw())
|
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
|
|
||||||
nitems, npaths) ~ grp, df, mean)
|
|
||||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
|
|
||||||
nitems, npaths) ~ grp, df, median)
|
|
||||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
|
|
||||||
nitems, npaths) ~ grp, df, max)
|
|
||||||
|
|
||||||
|
|
||||||
df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize",
|
|
||||||
"rotationDegree", "length", "nmove",
|
|
||||||
"nitems", "npaths")])$case
|
|
||||||
|
|
||||||
res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE)
|
|
||||||
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
||||||
|
|
||||||
xtabs( ~ item + grp, res)
|
xtabs( ~ item + grp, res)
|
||||||
@ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res)
|
|||||||
vioplot::vioplot(scaleSize ~ grp, res)
|
vioplot::vioplot(scaleSize ~ grp, res)
|
||||||
vioplot::vioplot(rotationDegree ~ grp, res)
|
vioplot::vioplot(rotationDegree ~ grp, res)
|
||||||
|
|
||||||
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
|
||||||
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
|
||||||
|
|
||||||
write.table(res,
|
write.table(res,
|
||||||
file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
|
file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
|
||||||
sep = ";",
|
sep = ";",
|
||||||
quote = FALSE,
|
quote = FALSE,
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
#--------------- (3) Investigate variants ---------------
|
#--------------- (3) Fit tree ---------------
|
||||||
|
|
||||||
|
library(rpart)
|
||||||
|
library(partykit)
|
||||||
|
|
||||||
|
dattree$Duration_scaled <- scale(dattree$Duration)
|
||||||
|
dattree$grp <- factor(dattree$grp)
|
||||||
|
dattree$Pattern <- factor(dattree$Pattern)
|
||||||
|
|
||||||
|
c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves +
|
||||||
|
Pattern, data = dattree, method = "class")
|
||||||
|
|
||||||
|
plot(as.party(c1))
|
||||||
|
|
||||||
|
c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
|
||||||
|
data = dattree, method = "class")
|
||||||
|
|
||||||
|
plot(as.party(c2))
|
||||||
|
|
||||||
|
#--------------- (4) Investigate variants ---------------
|
||||||
|
|
||||||
res$start <- res$date.start
|
res$start <- res$date.start
|
||||||
res$complete <- res$date.stop
|
res$complete <- res$date.stop
|
||||||
@ -558,3 +478,30 @@ for (case in cases) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
########################### TODO: Still need it?
|
||||||
|
|
||||||
|
|
||||||
|
net <- process_map(alog, render = FALSE)
|
||||||
|
#DiagrammeR::get_node_df(net)
|
||||||
|
|
||||||
|
DiagrammeR::get_node_info(net)
|
||||||
|
|
||||||
|
DiagrammeR::get_degree_distribution(net)
|
||||||
|
|
||||||
|
DiagrammeR::get_degree_in(net)
|
||||||
|
DiagrammeR::get_degree_out(net)
|
||||||
|
DiagrammeR::get_degree_total(net)
|
||||||
|
|
||||||
|
|
||||||
|
N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes
|
||||||
|
|
||||||
|
dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
|
||||||
|
|
||||||
|
inet <- DiagrammeR::to_igraph(net)
|
||||||
|
igraph::centr_degree(inet, loops = FALSE)
|
||||||
|
igraph::centr_betw(inet)
|
||||||
|
igraph::centr_clo(inet)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user