Played around with clustering; switched to glower distances and removed kmeans clustering
This commit is contained in:
parent
ea0660817a
commit
f8c1767074
@ -78,6 +78,8 @@ datcase <- datcase |>
|
||||
merge(eventtab, by = "case", all = TRUE) |>
|
||||
merge(topictab, by = "case", all = TRUE)
|
||||
|
||||
rm(eventtab, topictab)
|
||||
|
||||
datcase$ntopiccards <- aggregate(topic ~ case, dat,
|
||||
function(x) ifelse(all(is.na(x)), NA,
|
||||
length(na.omit(x))), na.action =
|
||||
@ -136,6 +138,7 @@ check_infocards <- function(subdata, artworks) {
|
||||
}
|
||||
as.numeric(infocard_only)
|
||||
}
|
||||
# TODO: Move to helper file
|
||||
|
||||
artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
|
||||
|
||||
@ -167,208 +170,37 @@ heatmap(cor_mat)
|
||||
normalize <- function(x) {
|
||||
(x - min(x)) / (max(x) - min(x))
|
||||
}
|
||||
|
||||
|
||||
#df <- as.data.frame(lapply(datcase[, -1], normalize))
|
||||
df <- as.data.frame(lapply(datcase[, -1], scale))
|
||||
#df <- datcase[, -1]
|
||||
|
||||
# "Flatten" with PCA
|
||||
pc <- prcomp(df)
|
||||
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
|
||||
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
|
||||
|
||||
plot(coor_2d)
|
||||
rgl::plot3d(coor_3d)
|
||||
|
||||
#--------------- (2.1) K-Means clustering ---------------
|
||||
|
||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
||||
|
||||
k1 <- kmeans(df, 4)
|
||||
|
||||
grp_km <- k1$cluster
|
||||
table(grp_km)
|
||||
|
||||
fviz_cluster(list(data = df, cluster = grp_km),
|
||||
palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
plot(coor_2d, col = mycols[grp_km])
|
||||
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_km])
|
||||
|
||||
aggregate(. ~ grp_km, df, mean)
|
||||
|
||||
#--------------- (2.2) Hierarchical clustering ---------------
|
||||
|
||||
mat <- dist(df)
|
||||
|
||||
h1 <- hclust(mat, method = "average")
|
||||
h2 <- hclust(mat, method = "complete")
|
||||
h3 <- hclust(mat, method = "ward.D")
|
||||
h4 <- hclust(mat, method = "ward.D2")
|
||||
h5 <- hclust(mat, method = "single")
|
||||
|
||||
# Cophenetic Distances, for each linkage (runs quite some time!)
|
||||
c1 <- cophenetic(h1)
|
||||
c2 <- cophenetic(h2)
|
||||
c3 <- cophenetic(h3)
|
||||
c4 <- cophenetic(h4)
|
||||
c5 <- cophenetic(h5)
|
||||
|
||||
# Correlations
|
||||
cor(mat, c1)
|
||||
cor(mat, c2)
|
||||
cor(mat, c3)
|
||||
cor(mat, c4)
|
||||
cor(mat, c5)
|
||||
|
||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||
|
||||
hc <- h4
|
||||
|
||||
# Something like a scree plot (??)
|
||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||
|
||||
k <- 4
|
||||
|
||||
grp_hclust <- cutree(hc, k = k)
|
||||
|
||||
table(grp_hclust)
|
||||
|
||||
fviz_cluster(list(data = df, cluster = grp_hclust),
|
||||
palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
plot(coor_2d, col = mycols[grp_hclust])
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
|
||||
|
||||
table(dattree[grp_hclust == 1, "Pattern"])
|
||||
table(dattree[grp_hclust == 2, "Pattern"])
|
||||
table(dattree[grp_hclust == 3, "Pattern"])
|
||||
table(dattree[grp_hclust == 4, "Pattern"])
|
||||
|
||||
|
||||
aggregate(. ~ grp_hclust, df, mean)
|
||||
|
||||
|
||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
||||
mean)
|
||||
|
||||
#--------------- (2.3) DBSCAN clustering ---------------
|
||||
|
||||
library(dbscan)
|
||||
d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
|
||||
hullplot(df, d1)
|
||||
|
||||
grp_db <- d1$cluster
|
||||
table(grp_db)
|
||||
|
||||
kNNdistplot(df, k = ncol(df))
|
||||
abline(h = 0.2, col = "red")
|
||||
abline(h = 0.06, col = "red")
|
||||
|
||||
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
||||
#palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
mycols <- c("black", mycols)
|
||||
|
||||
plot(coor_2d, col = mycols[grp_db + 1])
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
|
||||
|
||||
aggregate(. ~ grp_db, df, mean)
|
||||
|
||||
table(dattree[grp_db == 0, "Pattern"])
|
||||
table(dattree[grp_db == 1, "Pattern"])
|
||||
table(dattree[grp_db == 2, "Pattern"])
|
||||
table(dattree[grp_db == 3, "Pattern"])
|
||||
|
||||
# Does not really work with these features!
|
||||
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
|
||||
# Navigation types by Bousbia et al. (2010):
|
||||
# - Overviewing: this value is close to the Canter “scanning” value. It
|
||||
# implies that the learner is covering a large proportion of course pages.
|
||||
# Through this phase of fast-reading, the user seeks to acquire an
|
||||
# overall view of the course.
|
||||
# - Flitting: close to “wandering”. It reflects a browsing activity without a
|
||||
# strategy or a particular goal. The main difference with the overviewing
|
||||
# type is the lack of focus on the course.
|
||||
# - Studying: corresponds to a partial or complete reading of the course
|
||||
# pages where the learner spends time on each page.
|
||||
# - Deepening: This describes a learner who spends relatively long time on a
|
||||
# course, checking details, and seeking Web documents related to the course
|
||||
# topics. The main difference with studying is the Web search part that the
|
||||
# learner uses to obtain a deeper understanding of the course.
|
||||
|
||||
# Taxonomy defined by Canter et al. (1985):
|
||||
# - Scanning: seeking an overview of a theme (i.e. subpart of the hypermedia)
|
||||
# by requesting an important proportion of its pages but without spending
|
||||
# much time on them.
|
||||
# - Browsing: going wherever the data leads the navigator until catching an
|
||||
# interest.
|
||||
# - Exploring: reading the viewed pages thoroughly.
|
||||
# - Searching: seeking for a particular document or information.
|
||||
# - Wandering: navigating in an unstructured fashion without any particular
|
||||
# goal or strategy.
|
||||
# TODO: Move to helper file
|
||||
|
||||
# Features for navigation types for MTT:
|
||||
# - Scanning / Overviewing:
|
||||
# * Proportion of artworks looked at is high: datcase$nitems / 70
|
||||
# * Proportion of artworks looked at is high
|
||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||
# - Exploring:
|
||||
# * Looking at additional information for most items touched (high value):
|
||||
# harmonic mean of datcase$nopenTopic / datcase$nflipCard and
|
||||
# datcase$nopenPopup / datcase$nopenTopic
|
||||
# * Looking at additional information is high
|
||||
# - Searching / Studying:
|
||||
# * Looking only at a few items
|
||||
# datcase$nitems / 70 is low
|
||||
# * Proportion of artworks looked at is low
|
||||
# * Opening few cards
|
||||
# datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low
|
||||
# * but for most cards popups are opened:
|
||||
# datcase$nopenPopup / datcase$nflipCard is high
|
||||
# - Wandering / Flitting:
|
||||
# * Items are mostly just moved:
|
||||
# datcase$nmove / datcase$length is high
|
||||
# * Proportion of moves is high
|
||||
# * Duration per case is low:
|
||||
# datcase$duration / mean(datcase$duration) or median(datcase$duration)
|
||||
# * Duration per artwork is low: "ave_duration_item" / datcase$duration
|
||||
|
||||
dattree <- data.frame(case = datcase$case,
|
||||
#Duration = datcase$duration,
|
||||
PropItems = datcase$nitems / length(unique(dat$item)),
|
||||
#PropTopic = datcase$nopenTopic / datcase$nflipCard,
|
||||
#PropPopup = datcase$nopenPopup / datcase$nopenTopic,
|
||||
# SearchInfo =
|
||||
# 2*(((datcase$nopenPopup / datcase$nopenTopic) *
|
||||
# (datcase$nopenTopic / datcase$nflipCard)) /
|
||||
# ((datcase$nopenPopup / datcase$nopenTopic) +
|
||||
# (datcase$nopenTopic / datcase$nflipCard))
|
||||
# ),
|
||||
SearchInfo = datcase$nopenTopic / datcase$nflipCard +
|
||||
datcase$nopenPopup / datcase$nopenTopic,
|
||||
NumItems = datcase$nitems,
|
||||
NumTopic = datcase$nopenTopic,
|
||||
NumPopup = datcase$nopenPopup,
|
||||
PropMoves = datcase$nmove / datcase$length,
|
||||
PathLinearity = datcase$nitems / datcase$npaths,
|
||||
Singularity = datcase$npaths / datcase$length
|
||||
)
|
||||
|
||||
#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
|
||||
dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
|
||||
|
||||
#dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
|
||||
#dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
|
||||
dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
|
||||
dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
|
||||
|
||||
get_centrality <- function(case, data) {
|
||||
|
||||
@ -388,17 +220,16 @@ get_centrality <- function(case, data) {
|
||||
igraph::centr_degree(inet, loops = TRUE)$centralization,
|
||||
igraph::centr_betw(inet)$centralization)
|
||||
}
|
||||
# TODO: Move to helper file
|
||||
|
||||
# centrality <- lapply(dattree$case, get_centrality, data = dat)
|
||||
# centrality <- do.call(rbind, centrality)
|
||||
#
|
||||
# save(centrality, file = "results/haum/tmp_centrality.RData")
|
||||
load("results/haum/tmp_centrality.RData")
|
||||
|
||||
centrality <- lapply(dattree$case, get_centrality, data = dat)
|
||||
centrality <- do.call(rbind, centrality)
|
||||
|
||||
save(centrality, file = "results/haum/tmp_centrality.RData")
|
||||
|
||||
#dattree$centr_degree <- centrality[, 1]
|
||||
#dattree$centr_degree_loops <- centrality[, 2]
|
||||
dattree$DegreeCentrality <- centrality[, 2]
|
||||
#dattree$BetweenCentrality <- centrality[, 3]
|
||||
#dattree$DegreeCentrality <- centrality[, 2]
|
||||
dattree$BetweenCentrality <- centrality[, 3]
|
||||
|
||||
## Add average duration per item
|
||||
|
||||
@ -413,135 +244,118 @@ tmp$duration <- tmp$max_time - tmp$min_time
|
||||
tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case
|
||||
|
||||
dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
|
||||
#dattree$AvDurItem <- dattree$AvDurItem / datcase$duration
|
||||
|
||||
rm(tmp)
|
||||
|
||||
summary(dattree)
|
||||
|
||||
plot(dattree[, -1], pch = ".")
|
||||
|
||||
par(mfrow = c(3,4))
|
||||
hist(dattree$Duration, breaks = 50, main = "")
|
||||
par(mfrow = c(2,4))
|
||||
hist(dattree$AvDurItem, breaks = 50, main = "")
|
||||
hist(dattree$PropItems, breaks = 50, main = "")
|
||||
hist(dattree$PropTopic, breaks = 50, main = "")
|
||||
hist(dattree$PropPopup, breaks = 50, main = "")
|
||||
hist(dattree$NumItems, breaks = 50, main = "")
|
||||
hist(dattree$NumTopic, breaks = 50, main = "")
|
||||
hist(dattree$NumPopup, breaks = 50, main = "")
|
||||
hist(dattree$PropMoves, breaks = 50, main = "")
|
||||
hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||
hist(dattree$Singularity, breaks = 50, main = "")
|
||||
hist(dattree$centr_degree, breaks = 50, main = "")
|
||||
hist(dattree$centr_degree_loops, breaks = 50, main = "")
|
||||
hist(dattree$centr_between, breaks = 50, main = "")
|
||||
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||
|
||||
# Indicator variable if table was used as info terminal only
|
||||
dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, labels = c("no", "yes"))
|
||||
|
||||
# Add pattern to datcase; loosely based on Bousbia et al. (2009)
|
||||
dattree$Pattern <- "Dispersion"
|
||||
dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar",
|
||||
dattree$Pattern)
|
||||
dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
||||
dattree$BetweenCentrality > 0.5, "Star",
|
||||
dattree$Pattern)
|
||||
dattree$Pattern <- factor(dattree$Pattern)
|
||||
|
||||
|
||||
cor_mat <- cor(dattree[, -1], use = "pairwise")
|
||||
diag(cor_mat) <- NA
|
||||
heatmap(cor_mat)
|
||||
# Remove cases with extreme outliers
|
||||
# TODO: Do I want this???
|
||||
|
||||
quantile(datcase$nopenTopic, 0.999)
|
||||
quantile(datcase$nopenPopup, 0.999)
|
||||
|
||||
# dattree$Pattern <- "Dispersion"
|
||||
# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
|
||||
# dattree$Singularity > 0.8, "Scholar",
|
||||
# dattree$Pattern)
|
||||
# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
|
||||
# dattree$centr_between > 0.5, "Star",
|
||||
# dattree$Pattern)
|
||||
dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ]
|
||||
|
||||
plot(dattree[, -1], pch = ".")
|
||||
|
||||
par(mfrow = c(2,4))
|
||||
hist(dattree$AvDurItem, breaks = 50, main = "")
|
||||
hist(dattree$NumItems, breaks = 50, main = "")
|
||||
hist(dattree$NumTopic, breaks = 50, main = "")
|
||||
hist(dattree$NumPopup, breaks = 50, main = "")
|
||||
hist(dattree$PropMoves, breaks = 50, main = "")
|
||||
hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||
hist(dattree$Singularity, breaks = 50, main = "")
|
||||
hist(dattree$BetweenCentrality, breaks = 50, main = "")
|
||||
|
||||
write.table(dattree,
|
||||
file = "results/haum/dattree.csv",
|
||||
sep = ";",
|
||||
quote = FALSE,
|
||||
row.names = FALSE)
|
||||
|
||||
#--------------- (2) Clustering ---------------
|
||||
|
||||
df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
|
||||
#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
|
||||
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
|
||||
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
|
||||
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
|
||||
library(cluster)
|
||||
|
||||
# scale Duration and min/max SearchInfo
|
||||
#df$AvDurItem <- as.numeric(scale(df$AvDurItem))
|
||||
df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) /
|
||||
(max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE))
|
||||
#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
|
||||
# (max(df$SearchInfo) - min(df$SearchInfo))
|
||||
df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
|
||||
(max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE))
|
||||
df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
|
||||
(max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
|
||||
df <- dattree[1:10000, -1] # remove case variable
|
||||
# TODO: Do I need to scale or does normalization also work?
|
||||
|
||||
# Normalize Duration and Numbers
|
||||
# df$AvDurItem <- normalize(df$AvDurItem)
|
||||
# df$NumItems <- normalize(df$NumItems)
|
||||
# df$NumTopic <- normalize(df$NumTopic)
|
||||
# df$NumPopup <- normalize(df$NumPopup)
|
||||
|
||||
# summary(df)
|
||||
|
||||
# Look at collinearity
|
||||
cor_mat <- cor(df)
|
||||
diag(cor_mat) <- NA
|
||||
heatmap(cor_mat)
|
||||
|
||||
#df <- as.data.frame(scale(dattree[, -1]))
|
||||
|
||||
|
||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
df <- dattree[, -1]
|
||||
df$AvDurItem <- normalize(df$AvDurItem)
|
||||
df$SearchInfo <- normalize(df$SearchInfo)
|
||||
df$InfocardOnly <- datcase$infocardOnly
|
||||
summary(df)
|
||||
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
#--------------- (2.2) Hierarchical clustering ---------------
|
||||
|
||||
# "Flatten" with PCA
|
||||
pc <- prcomp(df)
|
||||
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
|
||||
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
|
||||
mat <- daisy(df, metric = "gower")
|
||||
|
||||
# "Flatten" with MDS
|
||||
coor_2d <- as.data.frame(cmdscale(mat, k = 2))
|
||||
coor_3d <- as.data.frame(cmdscale(mat, k = 3))
|
||||
|
||||
plot(coor_2d)
|
||||
rgl::plot3d(coor_3d)
|
||||
|
||||
#--------------- (2.1) K-Means clustering ---------------
|
||||
#mat <- dist(df)
|
||||
|
||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
||||
# https://uc-r.github.io/hc_clustering
|
||||
method <- c(average = "average", single = "single", complete = "complete",
|
||||
ward = "ward.D2")
|
||||
|
||||
k1 <- kmeans(df, 4)
|
||||
hc_method <- function(x) {
|
||||
hclust(mat, method = x)
|
||||
}
|
||||
|
||||
grp_km <- k1$cluster
|
||||
table(grp_km)
|
||||
|
||||
fviz_cluster(list(data = df, cluster = grp_km),
|
||||
palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
|
||||
plot(coor_2d, col = mycols[grp_km])
|
||||
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_km])
|
||||
|
||||
aggregate(. ~ grp_km, df, mean)
|
||||
|
||||
#--------------- (2.2) Hierarchical clustering ---------------
|
||||
|
||||
mat <- dist(df)
|
||||
|
||||
h1 <- hclust(mat, method = "average")
|
||||
h2 <- hclust(mat, method = "complete")
|
||||
h3 <- hclust(mat, method = "ward.D")
|
||||
h4 <- hclust(mat, method = "ward.D2")
|
||||
h5 <- hclust(mat, method = "single")
|
||||
|
||||
# Cophenetic Distances, for each linkage (runs quite some time!)
|
||||
c1 <- cophenetic(h1)
|
||||
c2 <- cophenetic(h2)
|
||||
c3 <- cophenetic(h3)
|
||||
c4 <- cophenetic(h4)
|
||||
c5 <- cophenetic(h5)
|
||||
|
||||
# Correlations
|
||||
cor(mat, c1)
|
||||
cor(mat, c2)
|
||||
cor(mat, c3)
|
||||
cor(mat, c4)
|
||||
cor(mat, c5)
|
||||
hcs <- lapply(method, hc_method)
|
||||
cds <- lapply(hcs, cophenetic)
|
||||
cors <- sapply(cds, cor, y = mat)
|
||||
|
||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||
|
||||
hc <- h4
|
||||
hc <- hcs$average
|
||||
|
||||
# Something like a scree plot (??)
|
||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||
|
||||
k <- 4
|
||||
|
||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
||||
|
||||
grp_hclust <- cutree(hc, k = k)
|
||||
|
||||
table(grp_hclust)
|
||||
@ -553,17 +367,16 @@ fviz_cluster(list(data = df, cluster = grp_hclust),
|
||||
ggtheme = theme_bw())
|
||||
|
||||
plot(coor_2d, col = mycols[grp_hclust])
|
||||
legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21)
|
||||
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
|
||||
|
||||
table(dattree[grp_hclust == 1, "Pattern"])
|
||||
table(dattree[grp_hclust == 2, "Pattern"])
|
||||
table(dattree[grp_hclust == 3, "Pattern"])
|
||||
table(dattree[grp_hclust == 4, "Pattern"])
|
||||
|
||||
table(datcase[grp_hclust == 1, "Pattern"])
|
||||
table(datcase[grp_hclust == 2, "Pattern"])
|
||||
table(datcase[grp_hclust == 3, "Pattern"])
|
||||
table(datcase[grp_hclust == 4, "Pattern"])
|
||||
|
||||
aggregate(. ~ grp_hclust, df, mean)
|
||||
|
||||
|
||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
||||
mean)
|
||||
@ -571,7 +384,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||
#--------------- (2.3) DBSCAN clustering ---------------
|
||||
|
||||
library(dbscan)
|
||||
d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
|
||||
d1 <- dbscan(df, eps = 1, minPts = ncol(df) + 1)
|
||||
hullplot(df, d1)
|
||||
|
||||
grp_db <- d1$cluster
|
||||
@ -579,10 +392,10 @@ table(grp_db)
|
||||
|
||||
kNNdistplot(df, k = ncol(df))
|
||||
abline(h = 0.2, col = "red")
|
||||
abline(h = 0.06, col = "red")
|
||||
abline(h = 1, col = "red")
|
||||
|
||||
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
||||
#palette = mycols,
|
||||
palette = mycols,
|
||||
ellipse.type = "convex",
|
||||
show.clust.cent = FALSE,
|
||||
ggtheme = theme_bw())
|
||||
@ -595,17 +408,20 @@ rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
|
||||
|
||||
aggregate(. ~ grp_db, df, mean)
|
||||
|
||||
table(dattree[grp_db == 0, "Pattern"])
|
||||
table(dattree[grp_db == 1, "Pattern"])
|
||||
table(dattree[grp_db == 2, "Pattern"])
|
||||
table(dattree[grp_db == 3, "Pattern"])
|
||||
table(datcase[grp_db == 0, "Pattern"])
|
||||
table(datcase[grp_db == 1, "Pattern"])
|
||||
table(datcase[grp_db == 2, "Pattern"])
|
||||
table(datcase[grp_db == 3, "Pattern"])
|
||||
table(datcase[grp_db == 4, "Pattern"])
|
||||
|
||||
### Look at selected cases ###########################################
|
||||
dattree[grp_db == 0, ]
|
||||
|
||||
tmp <- dat
|
||||
tmp$start <- tmp$date.start
|
||||
tmp$complete <- tmp$date.stop
|
||||
|
||||
alog <- activitylog(tmp[tmp$case == 30418, ],
|
||||
alog <- activitylog(tmp[tmp$case == 15, ],
|
||||
case_id = "case",
|
||||
activity_id = "item",
|
||||
resource_id = "path",
|
||||
@ -650,38 +466,23 @@ library(partykit)
|
||||
|
||||
dattree_db <- dattree[grp_db != 0, -1]
|
||||
dattree_db$grp <- factor(grp_db[grp_db != 0])
|
||||
dattree_db$Pattern <- factor(dattree_db$Pattern)
|
||||
|
||||
c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
|
||||
plot(as.party(c1))
|
||||
|
||||
c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
|
||||
c2 <- rpart(as.factor(grp_hclust) ~ ., data = dattree[, -1], method = "class")
|
||||
plot(as.party(c2))
|
||||
|
||||
|
||||
|
||||
c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
|
||||
Pattern, data = dattree_db, method = "class")
|
||||
plot(as.party(c1))
|
||||
|
||||
# with conditional tree
|
||||
c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
|
||||
c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.05)
|
||||
plot(c2)
|
||||
|
||||
c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||
PropMoves + Pattern, data = dattree_db, alpha = 0)
|
||||
plot(c3)
|
||||
|
||||
c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||
PropMoves + Pattern, data = dattree_db, alpha = 1)
|
||||
plot(c4)
|
||||
|
||||
# with excluded points
|
||||
c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
|
||||
c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0)
|
||||
plot(c5)
|
||||
|
||||
# with excluded points
|
||||
c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1)
|
||||
c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 0)
|
||||
plot(c6)
|
||||
# --> just checking
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user