Played around with clustering; switched to glower distances and removed kmeans clustering
This commit is contained in:
parent
ea0660817a
commit
f8c1767074
@ -74,10 +74,12 @@ topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6]
|
|||||||
topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7]
|
topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7]
|
||||||
topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8]
|
topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8]
|
||||||
|
|
||||||
datcase <- datcase |>
|
datcase <- datcase |>
|
||||||
merge(eventtab, by = "case", all = TRUE) |>
|
merge(eventtab, by = "case", all = TRUE) |>
|
||||||
merge(topictab, by = "case", all = TRUE)
|
merge(topictab, by = "case", all = TRUE)
|
||||||
|
|
||||||
|
rm(eventtab, topictab)
|
||||||
|
|
||||||
datcase$ntopiccards <- aggregate(topic ~ case, dat,
|
datcase$ntopiccards <- aggregate(topic ~ case, dat,
|
||||||
function(x) ifelse(all(is.na(x)), NA,
|
function(x) ifelse(all(is.na(x)), NA,
|
||||||
length(na.omit(x))), na.action =
|
length(na.omit(x))), na.action =
|
||||||
@ -136,6 +138,7 @@ check_infocards <- function(subdata, artworks) {
|
|||||||
}
|
}
|
||||||
as.numeric(infocard_only)
|
as.numeric(infocard_only)
|
||||||
}
|
}
|
||||||
|
# TODO: Move to helper file
|
||||||
|
|
||||||
artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
|
artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
|
||||||
|
|
||||||
@ -167,214 +170,43 @@ heatmap(cor_mat)
|
|||||||
normalize <- function(x) {
|
normalize <- function(x) {
|
||||||
(x - min(x)) / (max(x) - min(x))
|
(x - min(x)) / (max(x) - min(x))
|
||||||
}
|
}
|
||||||
|
# TODO: Move to helper file
|
||||||
|
|
||||||
#df <- as.data.frame(lapply(datcase[, -1], normalize))
|
|
||||||
df <- as.data.frame(lapply(datcase[, -1], scale))
|
|
||||||
#df <- datcase[, -1]
|
|
||||||
|
|
||||||
# "Flatten" with PCA
|
|
||||||
pc <- prcomp(df)
|
|
||||||
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
|
|
||||||
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
|
|
||||||
|
|
||||||
plot(coor_2d)
|
|
||||||
rgl::plot3d(coor_3d)
|
|
||||||
|
|
||||||
#--------------- (2.1) K-Means clustering ---------------
|
|
||||||
|
|
||||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
|
||||||
|
|
||||||
k1 <- kmeans(df, 4)
|
|
||||||
|
|
||||||
grp_km <- k1$cluster
|
|
||||||
table(grp_km)
|
|
||||||
|
|
||||||
fviz_cluster(list(data = df, cluster = grp_km),
|
|
||||||
palette = mycols,
|
|
||||||
ellipse.type = "convex",
|
|
||||||
show.clust.cent = FALSE,
|
|
||||||
ggtheme = theme_bw())
|
|
||||||
|
|
||||||
plot(coor_2d, col = mycols[grp_km])
|
|
||||||
|
|
||||||
rgl::plot3d(coor_3d, col = mycols[grp_km])
|
|
||||||
|
|
||||||
aggregate(. ~ grp_km, df, mean)
|
|
||||||
|
|
||||||
#--------------- (2.2) Hierarchical clustering ---------------
|
|
||||||
|
|
||||||
mat <- dist(df)
|
|
||||||
|
|
||||||
h1 <- hclust(mat, method = "average")
|
|
||||||
h2 <- hclust(mat, method = "complete")
|
|
||||||
h3 <- hclust(mat, method = "ward.D")
|
|
||||||
h4 <- hclust(mat, method = "ward.D2")
|
|
||||||
h5 <- hclust(mat, method = "single")
|
|
||||||
|
|
||||||
# Cophenetic Distances, for each linkage (runs quite some time!)
|
|
||||||
c1 <- cophenetic(h1)
|
|
||||||
c2 <- cophenetic(h2)
|
|
||||||
c3 <- cophenetic(h3)
|
|
||||||
c4 <- cophenetic(h4)
|
|
||||||
c5 <- cophenetic(h5)
|
|
||||||
|
|
||||||
# Correlations
|
|
||||||
cor(mat, c1)
|
|
||||||
cor(mat, c2)
|
|
||||||
cor(mat, c3)
|
|
||||||
cor(mat, c4)
|
|
||||||
cor(mat, c5)
|
|
||||||
|
|
||||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
|
||||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
|
||||||
|
|
||||||
hc <- h4
|
|
||||||
|
|
||||||
# Something like a scree plot (??)
|
|
||||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
|
||||||
|
|
||||||
k <- 4
|
|
||||||
|
|
||||||
grp_hclust <- cutree(hc, k = k)
|
|
||||||
|
|
||||||
table(grp_hclust)
|
|
||||||
|
|
||||||
fviz_cluster(list(data = df, cluster = grp_hclust),
|
|
||||||
palette = mycols,
|
|
||||||
ellipse.type = "convex",
|
|
||||||
show.clust.cent = FALSE,
|
|
||||||
ggtheme = theme_bw())
|
|
||||||
|
|
||||||
plot(coor_2d, col = mycols[grp_hclust])
|
|
||||||
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
|
|
||||||
|
|
||||||
table(dattree[grp_hclust == 1, "Pattern"])
|
|
||||||
table(dattree[grp_hclust == 2, "Pattern"])
|
|
||||||
table(dattree[grp_hclust == 3, "Pattern"])
|
|
||||||
table(dattree[grp_hclust == 4, "Pattern"])
|
|
||||||
|
|
||||||
|
|
||||||
aggregate(. ~ grp_hclust, df, mean)
|
|
||||||
|
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
|
||||||
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
|
||||||
mean)
|
|
||||||
|
|
||||||
#--------------- (2.3) DBSCAN clustering ---------------
|
|
||||||
|
|
||||||
library(dbscan)
|
|
||||||
d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
|
|
||||||
hullplot(df, d1)
|
|
||||||
|
|
||||||
grp_db <- d1$cluster
|
|
||||||
table(grp_db)
|
|
||||||
|
|
||||||
kNNdistplot(df, k = ncol(df))
|
|
||||||
abline(h = 0.2, col = "red")
|
|
||||||
abline(h = 0.06, col = "red")
|
|
||||||
|
|
||||||
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
|
||||||
#palette = mycols,
|
|
||||||
ellipse.type = "convex",
|
|
||||||
show.clust.cent = FALSE,
|
|
||||||
ggtheme = theme_bw())
|
|
||||||
|
|
||||||
mycols <- c("black", mycols)
|
|
||||||
|
|
||||||
plot(coor_2d, col = mycols[grp_db + 1])
|
|
||||||
rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
|
|
||||||
|
|
||||||
aggregate(. ~ grp_db, df, mean)
|
|
||||||
|
|
||||||
table(dattree[grp_db == 0, "Pattern"])
|
|
||||||
table(dattree[grp_db == 1, "Pattern"])
|
|
||||||
table(dattree[grp_db == 2, "Pattern"])
|
|
||||||
table(dattree[grp_db == 3, "Pattern"])
|
|
||||||
|
|
||||||
# Does not really work with these features!
|
|
||||||
|
|
||||||
|
|
||||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
|
||||||
|
|
||||||
# Navigation types by Bousbia et al. (2010):
|
|
||||||
# - Overviewing: this value is close to the Canter “scanning” value. It
|
|
||||||
# implies that the learner is covering a large proportion of course pages.
|
|
||||||
# Through this phase of fast-reading, the user seeks to acquire an
|
|
||||||
# overall view of the course.
|
|
||||||
# - Flitting: close to “wandering”. It reflects a browsing activity without a
|
|
||||||
# strategy or a particular goal. The main difference with the overviewing
|
|
||||||
# type is the lack of focus on the course.
|
|
||||||
# - Studying: corresponds to a partial or complete reading of the course
|
|
||||||
# pages where the learner spends time on each page.
|
|
||||||
# - Deepening: This describes a learner who spends relatively long time on a
|
|
||||||
# course, checking details, and seeking Web documents related to the course
|
|
||||||
# topics. The main difference with studying is the Web search part that the
|
|
||||||
# learner uses to obtain a deeper understanding of the course.
|
|
||||||
|
|
||||||
# Taxonomy defined by Canter et al. (1985):
|
|
||||||
# - Scanning: seeking an overview of a theme (i.e. subpart of the hypermedia)
|
|
||||||
# by requesting an important proportion of its pages but without spending
|
|
||||||
# much time on them.
|
|
||||||
# - Browsing: going wherever the data leads the navigator until catching an
|
|
||||||
# interest.
|
|
||||||
# - Exploring: reading the viewed pages thoroughly.
|
|
||||||
# - Searching: seeking for a particular document or information.
|
|
||||||
# - Wandering: navigating in an unstructured fashion without any particular
|
|
||||||
# goal or strategy.
|
|
||||||
|
|
||||||
# Features for navigation types for MTT:
|
# Features for navigation types for MTT:
|
||||||
# - Scanning / Overviewing:
|
# - Scanning / Overviewing:
|
||||||
# * Proportion of artworks looked at is high: datcase$nitems / 70
|
# * Proportion of artworks looked at is high
|
||||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||||
# - Exploring:
|
# - Exploring:
|
||||||
# * Looking at additional information for most items touched (high value):
|
# * Looking at additional information is high
|
||||||
# harmonic mean of datcase$nopenTopic / datcase$nflipCard and
|
|
||||||
# datcase$nopenPopup / datcase$nopenTopic
|
|
||||||
# - Searching / Studying:
|
# - Searching / Studying:
|
||||||
# * Looking only at a few items
|
# * Proportion of artworks looked at is low
|
||||||
# datcase$nitems / 70 is low
|
|
||||||
# * Opening few cards
|
# * Opening few cards
|
||||||
# datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low
|
# datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low
|
||||||
# * but for most cards popups are opened:
|
# * but for most cards popups are opened:
|
||||||
# datcase$nopenPopup / datcase$nflipCard is high
|
# datcase$nopenPopup / datcase$nflipCard is high
|
||||||
# - Wandering / Flitting:
|
# - Wandering / Flitting:
|
||||||
# * Items are mostly just moved:
|
# * Proportion of moves is high
|
||||||
# datcase$nmove / datcase$length is high
|
|
||||||
# * Duration per case is low:
|
# * Duration per case is low:
|
||||||
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
|
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
|
||||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||||
|
|
||||||
dattree <- data.frame(case = datcase$case,
|
dattree <- data.frame(case = datcase$case,
|
||||||
#Duration = datcase$duration,
|
NumItems = datcase$nitems,
|
||||||
PropItems = datcase$nitems / length(unique(dat$item)),
|
NumTopic = datcase$nopenTopic,
|
||||||
#PropTopic = datcase$nopenTopic / datcase$nflipCard,
|
NumPopup = datcase$nopenPopup,
|
||||||
#PropPopup = datcase$nopenPopup / datcase$nopenTopic,
|
|
||||||
# SearchInfo =
|
|
||||||
# 2*(((datcase$nopenPopup / datcase$nopenTopic) *
|
|
||||||
# (datcase$nopenTopic / datcase$nflipCard)) /
|
|
||||||
# ((datcase$nopenPopup / datcase$nopenTopic) +
|
|
||||||
# (datcase$nopenTopic / datcase$nflipCard))
|
|
||||||
# ),
|
|
||||||
SearchInfo = datcase$nopenTopic / datcase$nflipCard +
|
|
||||||
datcase$nopenPopup / datcase$nopenTopic,
|
|
||||||
PropMoves = datcase$nmove / datcase$length,
|
PropMoves = datcase$nmove / datcase$length,
|
||||||
PathLinearity = datcase$nitems / datcase$npaths,
|
PathLinearity = datcase$nitems / datcase$npaths,
|
||||||
Singularity = datcase$npaths / datcase$length
|
Singularity = datcase$npaths / datcase$length
|
||||||
)
|
)
|
||||||
|
|
||||||
#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
|
dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
|
||||||
dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
|
dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
|
||||||
|
|
||||||
#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
|
|
||||||
#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
|
|
||||||
|
|
||||||
get_centrality <- function(case, data) {
|
get_centrality <- function(case, data) {
|
||||||
|
|
||||||
data$start <- data$date.start
|
data$start <- data$date.start
|
||||||
data$complete <- data$date.stop
|
data$complete <- data$date.stop
|
||||||
|
|
||||||
alog <- activitylog(data[data$case == case, ],
|
alog <- activitylog(data[data$case == case, ],
|
||||||
case_id = "case",
|
case_id = "case",
|
||||||
activity_id = "item",
|
activity_id = "item",
|
||||||
@ -388,17 +220,16 @@ get_centrality <- function(case, data) {
|
|||||||
igraph::centr_degree(inet, loops = TRUE)$centralization,
|
igraph::centr_degree(inet, loops = TRUE)$centralization,
|
||||||
igraph::centr_betw(inet)$centralization)
|
igraph::centr_betw(inet)$centralization)
|
||||||
}
|
}
|
||||||
|
# TODO: Move to helper file
|
||||||
|
|
||||||
|
# centrality <- lapply(dattree$case, get_centrality, data = dat)
|
||||||
|
# centrality <- do.call(rbind, centrality)
|
||||||
|
#
|
||||||
|
# save(centrality, file = "results/haum/tmp_centrality.RData")
|
||||||
|
load("results/haum/tmp_centrality.RData")
|
||||||
|
|
||||||
centrality <- lapply(dattree$case, get_centrality, data = dat)
|
#dattree$DegreeCentrality <- centrality[, 2]
|
||||||
centrality <- do.call(rbind, centrality)
|
dattree$BetweenCentrality <- centrality[, 3]
|
||||||
|
|
||||||
save(centrality, file = "results/haum/tmp_centrality.RData")
|
|
||||||
|
|
||||||
#dattree$centr_degree <- centrality[, 1]
|
|
||||||
#dattree$centr_degree_loops <- centrality[, 2]
|
|
||||||
dattree$DegreeCentrality <- centrality[, 2]
|
|
||||||
#dattree$BetweenCentrality <- centrality[, 3]
|
|
||||||
|
|
||||||
## Add average duration per item
|
## Add average duration per item
|
||||||
|
|
||||||
@ -413,135 +244,118 @@ tmp$duration <- tmp$max_time - tmp$min_time
|
|||||||
tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case
|
tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case
|
||||||
|
|
||||||
dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
|
dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
|
||||||
|
#dattree$AvDurItem <- dattree$AvDurItem / datcase$duration
|
||||||
|
|
||||||
rm(tmp)
|
rm(tmp)
|
||||||
|
|
||||||
|
summary(dattree)
|
||||||
|
|
||||||
plot(dattree[, -1], pch = ".")
|
plot(dattree[, -1], pch = ".")
|
||||||
|
|
||||||
par(mfrow = c(3,4))
|
par(mfrow = c(2,4))
|
||||||
hist(dattree$Duration, breaks = 50, main = "")
|
|
||||||
hist(dattree$AvDurItem, breaks = 50, main = "")
|
hist(dattree$AvDurItem, breaks = 50, main = "")
|
||||||
hist(dattree$PropItems, breaks = 50, main = "")
|
hist(dattree$NumItems, breaks = 50, main = "")
|
||||||
hist(dattree$PropTopic, breaks = 50, main = "")
|
hist(dattree$NumTopic, breaks = 50, main = "")
|
||||||
hist(dattree$PropPopup, breaks = 50, main = "")
|
hist(dattree$NumPopup, breaks = 50, main = "")
|
||||||
hist(dattree$PropMoves, breaks = 50, main = "")
|
hist(dattree$PropMoves, breaks = 50, main = "")
|
||||||
hist(dattree$PathLinearity, breaks = 50, main = "")
|
hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||||
hist(dattree$Singularity, breaks = 50, main = "")
|
hist(dattree$Singularity, breaks = 50, main = "")
|
||||||
hist(dattree$centr_degree, breaks = 50, main = "")
|
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||||
hist(dattree$centr_degree_loops, breaks = 50, main = "")
|
|
||||||
hist(dattree$centr_between, breaks = 50, main = "")
|
# Indicator variable if table was used as info terminal only
|
||||||
|
dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, labels = c("no", "yes"))
|
||||||
|
|
||||||
|
# Add pattern to datcase; loosely based on Bousbia et al. (2009)
|
||||||
|
dattree$Pattern <- "Dispersion"
|
||||||
|
dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar",
|
||||||
|
dattree$Pattern)
|
||||||
|
dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
||||||
|
dattree$BetweenCentrality > 0.5, "Star",
|
||||||
|
dattree$Pattern)
|
||||||
|
dattree$Pattern <- factor(dattree$Pattern)
|
||||||
|
|
||||||
|
|
||||||
cor_mat <- cor(dattree[, -1], use = "pairwise")
|
# Remove cases with extreme outliers
|
||||||
diag(cor_mat) <- NA
|
# TODO: Do I want this???
|
||||||
heatmap(cor_mat)
|
|
||||||
|
|
||||||
|
quantile(datcase$nopenTopic, 0.999)
|
||||||
|
quantile(datcase$nopenPopup, 0.999)
|
||||||
|
|
||||||
# dattree$Pattern <- "Dispersion"
|
dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ]
|
||||||
# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
|
|
||||||
# dattree$Singularity > 0.8, "Scholar",
|
plot(dattree[, -1], pch = ".")
|
||||||
# dattree$Pattern)
|
|
||||||
# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
par(mfrow = c(2,4))
|
||||||
# dattree$centr_between > 0.5, "Star",
|
hist(dattree$AvDurItem, breaks = 50, main = "")
|
||||||
# dattree$Pattern)
|
hist(dattree$NumItems, breaks = 50, main = "")
|
||||||
|
hist(dattree$NumTopic, breaks = 50, main = "")
|
||||||
|
hist(dattree$NumPopup, breaks = 50, main = "")
|
||||||
|
hist(dattree$PropMoves, breaks = 50, main = "")
|
||||||
|
hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||||
|
hist(dattree$Singularity, breaks = 50, main = "")
|
||||||
|
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||||
|
|
||||||
write.table(dattree,
|
|
||||||
file = "results/haum/dattree.csv",
|
|
||||||
sep = ";",
|
|
||||||
quote = FALSE,
|
|
||||||
row.names = FALSE)
|
|
||||||
|
|
||||||
#--------------- (2) Clustering ---------------
|
#--------------- (2) Clustering ---------------
|
||||||
|
|
||||||
df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
|
library(cluster)
|
||||||
#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
|
|
||||||
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
|
|
||||||
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
|
|
||||||
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
|
|
||||||
|
|
||||||
# scale Duration and min/max SearchInfo
|
df <- dattree[1:10000, -1] # remove case variable
|
||||||
#df$AvDurItem <- as.numeric(scale(df$AvDurItem))
|
# TODO: Do I need to scale or does normalization also work?
|
||||||
df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) /
|
|
||||||
(max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE))
|
# Normalize Duration and Numbers
|
||||||
#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
|
# df$AvDurItem <- normalize(df$AvDurItem)
|
||||||
# (max(df$SearchInfo) - min(df$SearchInfo))
|
# df$NumItems <- normalize(df$NumItems)
|
||||||
df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
|
# df$NumTopic <- normalize(df$NumTopic)
|
||||||
(max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE))
|
# df$NumPopup <- normalize(df$NumPopup)
|
||||||
df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
|
|
||||||
(max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
|
# summary(df)
|
||||||
|
|
||||||
|
# Look at collinearity
|
||||||
|
cor_mat <- cor(df)
|
||||||
|
diag(cor_mat) <- NA
|
||||||
|
heatmap(cor_mat)
|
||||||
|
|
||||||
|
#df <- as.data.frame(scale(dattree[, -1]))
|
||||||
|
|
||||||
|
|
||||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
#--------------- (2.2) Hierarchical clustering ---------------
|
||||||
df <- dattree[, -1]
|
|
||||||
df$AvDurItem <- normalize(df$AvDurItem)
|
|
||||||
df$SearchInfo <- normalize(df$SearchInfo)
|
|
||||||
df$InfocardOnly <- datcase$infocardOnly
|
|
||||||
summary(df)
|
|
||||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
|
||||||
|
|
||||||
# "Flatten" with PCA
|
mat <- daisy(df, metric = "gower")
|
||||||
pc <- prcomp(df)
|
|
||||||
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
|
# "Flatten" with MDS
|
||||||
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
|
coor_2d <- as.data.frame(cmdscale(mat, k = 2))
|
||||||
|
coor_3d <- as.data.frame(cmdscale(mat, k = 3))
|
||||||
|
|
||||||
plot(coor_2d)
|
plot(coor_2d)
|
||||||
rgl::plot3d(coor_3d)
|
rgl::plot3d(coor_3d)
|
||||||
|
|
||||||
#--------------- (2.1) K-Means clustering ---------------
|
#mat <- dist(df)
|
||||||
|
|
||||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
# https://uc-r.github.io/hc_clustering
|
||||||
|
method <- c(average = "average", single = "single", complete = "complete",
|
||||||
|
ward = "ward.D2")
|
||||||
|
|
||||||
k1 <- kmeans(df, 4)
|
hc_method <- function(x) {
|
||||||
|
hclust(mat, method = x)
|
||||||
|
}
|
||||||
|
|
||||||
grp_km <- k1$cluster
|
hcs <- lapply(method, hc_method)
|
||||||
table(grp_km)
|
cds <- lapply(hcs, cophenetic)
|
||||||
|
cors <- sapply(cds, cor, y = mat)
|
||||||
fviz_cluster(list(data = df, cluster = grp_km),
|
|
||||||
palette = mycols,
|
|
||||||
ellipse.type = "convex",
|
|
||||||
show.clust.cent = FALSE,
|
|
||||||
ggtheme = theme_bw())
|
|
||||||
|
|
||||||
plot(coor_2d, col = mycols[grp_km])
|
|
||||||
|
|
||||||
rgl::plot3d(coor_3d, col = mycols[grp_km])
|
|
||||||
|
|
||||||
aggregate(. ~ grp_km, df, mean)
|
|
||||||
|
|
||||||
#--------------- (2.2) Hierarchical clustering ---------------
|
|
||||||
|
|
||||||
mat <- dist(df)
|
|
||||||
|
|
||||||
h1 <- hclust(mat, method = "average")
|
|
||||||
h2 <- hclust(mat, method = "complete")
|
|
||||||
h3 <- hclust(mat, method = "ward.D")
|
|
||||||
h4 <- hclust(mat, method = "ward.D2")
|
|
||||||
h5 <- hclust(mat, method = "single")
|
|
||||||
|
|
||||||
# Cophenetic Distances, for each linkage (runs quite some time!)
|
|
||||||
c1 <- cophenetic(h1)
|
|
||||||
c2 <- cophenetic(h2)
|
|
||||||
c3 <- cophenetic(h3)
|
|
||||||
c4 <- cophenetic(h4)
|
|
||||||
c5 <- cophenetic(h5)
|
|
||||||
|
|
||||||
# Correlations
|
|
||||||
cor(mat, c1)
|
|
||||||
cor(mat, c2)
|
|
||||||
cor(mat, c3)
|
|
||||||
cor(mat, c4)
|
|
||||||
cor(mat, c5)
|
|
||||||
|
|
||||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||||
|
|
||||||
hc <- h4
|
hc <- hcs$average
|
||||||
|
|
||||||
# Something like a scree plot (??)
|
# Something like a scree plot (??)
|
||||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||||
|
|
||||||
k <- 4
|
k <- 4
|
||||||
|
|
||||||
|
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
||||||
|
|
||||||
grp_hclust <- cutree(hc, k = k)
|
grp_hclust <- cutree(hc, k = k)
|
||||||
|
|
||||||
table(grp_hclust)
|
table(grp_hclust)
|
||||||
@ -553,17 +367,16 @@ fviz_cluster(list(data = df, cluster = grp_hclust),
|
|||||||
ggtheme = theme_bw())
|
ggtheme = theme_bw())
|
||||||
|
|
||||||
plot(coor_2d, col = mycols[grp_hclust])
|
plot(coor_2d, col = mycols[grp_hclust])
|
||||||
|
legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21)
|
||||||
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
|
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
|
||||||
|
|
||||||
table(dattree[grp_hclust == 1, "Pattern"])
|
table(datcase[grp_hclust == 1, "Pattern"])
|
||||||
table(dattree[grp_hclust == 2, "Pattern"])
|
table(datcase[grp_hclust == 2, "Pattern"])
|
||||||
table(dattree[grp_hclust == 3, "Pattern"])
|
table(datcase[grp_hclust == 3, "Pattern"])
|
||||||
table(dattree[grp_hclust == 4, "Pattern"])
|
table(datcase[grp_hclust == 4, "Pattern"])
|
||||||
|
|
||||||
|
|
||||||
aggregate(. ~ grp_hclust, df, mean)
|
aggregate(. ~ grp_hclust, df, mean)
|
||||||
|
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||||
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
||||||
mean)
|
mean)
|
||||||
@ -571,7 +384,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
|||||||
#--------------- (2.3) DBSCAN clustering ---------------
|
#--------------- (2.3) DBSCAN clustering ---------------
|
||||||
|
|
||||||
library(dbscan)
|
library(dbscan)
|
||||||
d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
|
d1 <- dbscan(df, eps = 1, minPts = ncol(df) + 1)
|
||||||
hullplot(df, d1)
|
hullplot(df, d1)
|
||||||
|
|
||||||
grp_db <- d1$cluster
|
grp_db <- d1$cluster
|
||||||
@ -579,10 +392,10 @@ table(grp_db)
|
|||||||
|
|
||||||
kNNdistplot(df, k = ncol(df))
|
kNNdistplot(df, k = ncol(df))
|
||||||
abline(h = 0.2, col = "red")
|
abline(h = 0.2, col = "red")
|
||||||
abline(h = 0.06, col = "red")
|
abline(h = 1, col = "red")
|
||||||
|
|
||||||
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
||||||
#palette = mycols,
|
palette = mycols,
|
||||||
ellipse.type = "convex",
|
ellipse.type = "convex",
|
||||||
show.clust.cent = FALSE,
|
show.clust.cent = FALSE,
|
||||||
ggtheme = theme_bw())
|
ggtheme = theme_bw())
|
||||||
@ -595,17 +408,20 @@ rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
|
|||||||
|
|
||||||
aggregate(. ~ grp_db, df, mean)
|
aggregate(. ~ grp_db, df, mean)
|
||||||
|
|
||||||
table(dattree[grp_db == 0, "Pattern"])
|
table(datcase[grp_db == 0, "Pattern"])
|
||||||
table(dattree[grp_db == 1, "Pattern"])
|
table(datcase[grp_db == 1, "Pattern"])
|
||||||
table(dattree[grp_db == 2, "Pattern"])
|
table(datcase[grp_db == 2, "Pattern"])
|
||||||
table(dattree[grp_db == 3, "Pattern"])
|
table(datcase[grp_db == 3, "Pattern"])
|
||||||
|
table(datcase[grp_db == 4, "Pattern"])
|
||||||
|
|
||||||
### Look at selected cases ###########################################
|
### Look at selected cases ###########################################
|
||||||
|
dattree[grp_db == 0, ]
|
||||||
|
|
||||||
tmp <- dat
|
tmp <- dat
|
||||||
tmp$start <- tmp$date.start
|
tmp$start <- tmp$date.start
|
||||||
tmp$complete <- tmp$date.stop
|
tmp$complete <- tmp$date.stop
|
||||||
|
|
||||||
alog <- activitylog(tmp[tmp$case == 30418, ],
|
alog <- activitylog(tmp[tmp$case == 15, ],
|
||||||
case_id = "case",
|
case_id = "case",
|
||||||
activity_id = "item",
|
activity_id = "item",
|
||||||
resource_id = "path",
|
resource_id = "path",
|
||||||
@ -650,38 +466,23 @@ library(partykit)
|
|||||||
|
|
||||||
dattree_db <- dattree[grp_db != 0, -1]
|
dattree_db <- dattree[grp_db != 0, -1]
|
||||||
dattree_db$grp <- factor(grp_db[grp_db != 0])
|
dattree_db$grp <- factor(grp_db[grp_db != 0])
|
||||||
dattree_db$Pattern <- factor(dattree_db$Pattern)
|
|
||||||
|
|
||||||
c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
|
c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
|
||||||
plot(as.party(c1))
|
plot(as.party(c1))
|
||||||
|
|
||||||
c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
|
c2 <- rpart(as.factor(grp_hclust) ~ ., data = dattree[, -1], method = "class")
|
||||||
plot(as.party(c2))
|
plot(as.party(c2))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
|
|
||||||
Pattern, data = dattree_db, method = "class")
|
|
||||||
plot(as.party(c1))
|
|
||||||
|
|
||||||
# with conditional tree
|
# with conditional tree
|
||||||
c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
|
c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.05)
|
||||||
plot(c2)
|
plot(c2)
|
||||||
|
|
||||||
c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
|
||||||
PropMoves + Pattern, data = dattree_db, alpha = 0)
|
|
||||||
plot(c3)
|
|
||||||
|
|
||||||
c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
|
||||||
PropMoves + Pattern, data = dattree_db, alpha = 1)
|
|
||||||
plot(c4)
|
|
||||||
|
|
||||||
# with excluded points
|
# with excluded points
|
||||||
c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
|
c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0)
|
||||||
plot(c5)
|
plot(c5)
|
||||||
|
|
||||||
# with excluded points
|
# with excluded points
|
||||||
c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1)
|
c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 0)
|
||||||
plot(c6)
|
plot(c6)
|
||||||
# --> just checking
|
# --> just checking
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user