Cleaned up, tried out some stuff with clustering; Pattern variable probably has to go
This commit is contained in:
parent
b3bc81ccbc
commit
72d2b6b799
@ -247,14 +247,14 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
|
|||||||
|
|
||||||
rm(tmp)
|
rm(tmp)
|
||||||
|
|
||||||
|
plot(dattree)
|
||||||
|
|
||||||
|
par(mfrow = c(3,4))
|
||||||
|
|
||||||
|
|
||||||
par(mfrow = c(3,3))
|
|
||||||
hist(dattree$Duration, breaks = 50, main = "")
|
hist(dattree$Duration, breaks = 50, main = "")
|
||||||
hist(dattree$SearchInfo, breaks = 50, main = "")
|
hist(dattree$AvDurItem, breaks = 50, main = "")
|
||||||
hist(dattree$PropItems, breaks = 50, main = "")
|
hist(dattree$PropItems, breaks = 50, main = "")
|
||||||
|
hist(dattree$PropTopic, breaks = 50, main = "")
|
||||||
|
hist(dattree$PropPopup, breaks = 50, main = "")
|
||||||
hist(dattree$PropMoves, breaks = 50, main = "")
|
hist(dattree$PropMoves, breaks = 50, main = "")
|
||||||
hist(dattree$PathLinearity, breaks = 50, main = "")
|
hist(dattree$PathLinearity, breaks = 50, main = "")
|
||||||
hist(dattree$Singularity, breaks = 50, main = "")
|
hist(dattree$Singularity, breaks = 50, main = "")
|
||||||
@ -286,13 +286,14 @@ write.table(dattree,
|
|||||||
|
|
||||||
df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
|
df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
|
||||||
#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
|
#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
|
||||||
# TODO: With or without duration? Why is it relevant?
|
|
||||||
|
|
||||||
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
|
df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
|
||||||
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
|
df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
|
||||||
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
|
df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
|
||||||
|
|
||||||
# rescale AvDurItem, PropTopic, and PropPopup (SearchInfo is currently unused)
|
# rescale AvDurItem, PropTopic, and PropPopup (SearchInfo is currently unused)
|
||||||
df$AvDurItem <- as.numeric(scale(df$AvDurItem))
|
#df$AvDurItem <- as.numeric(scale(df$AvDurItem))
|
||||||
|
df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) /
|
||||||
|
(max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE))
|
||||||
#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
|
#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
|
||||||
# (max(df$SearchInfo) - min(df$SearchInfo))
|
# (max(df$SearchInfo) - min(df$SearchInfo))
|
||||||
df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
|
df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
|
||||||
@ -300,6 +301,33 @@ df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
|
|||||||
df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
|
df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
|
||||||
(max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
|
(max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
|
||||||
|
|
||||||
|
|
||||||
|
# "Flatten" with PCA
|
||||||
|
pc <- prcomp(df)
|
||||||
|
coor_2d <- as.data.frame(pc$x[, c(1, 2)])
|
||||||
|
coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
|
||||||
|
|
||||||
|
#--------------- (2.1) K-Means clustering ---------------
|
||||||
|
|
||||||
|
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
||||||
|
|
||||||
|
k1 <- kmeans(df, 4)
|
||||||
|
|
||||||
|
grp_km <- k1$cluster
|
||||||
|
table(grp_km)
|
||||||
|
|
||||||
|
fviz_cluster(list(data = df, cluster = grp_km),
|
||||||
|
palette = mycols,
|
||||||
|
ellipse.type = "convex",
|
||||||
|
show.clust.cent = FALSE,
|
||||||
|
ggtheme = theme_bw())
|
||||||
|
|
||||||
|
plot(coor_2d, col = mycols[grp_km])
|
||||||
|
|
||||||
|
rgl::plot3d(coor_3d, col = mycols[grp_km])
|
||||||
|
|
||||||
|
#--------------- (2.2) Hierarchical clustering ---------------
|
||||||
|
|
||||||
mat <- dist(df)
|
mat <- dist(df)
|
||||||
# TODO: Do I need to scale all variables?
|
# TODO: Do I need to scale all variables?
|
||||||
|
|
||||||
@ -326,7 +354,7 @@ cor(mat, c5)
|
|||||||
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
||||||
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
||||||
|
|
||||||
hc <- h4
|
hc <- h1
|
||||||
|
|
||||||
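# Scree-like plot: heights of the last 100 merges (final merge first); look for an elbow
|
# Scree-like plot: heights of the last 100 merges (final merge first); look for an elbow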
|
||||||
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
|
||||||
@ -338,50 +366,46 @@ grp_hclust <- cutree(hc, k = k)
|
|||||||
table(grp_hclust)
|
table(grp_hclust)
|
||||||
|
|
||||||
fviz_cluster(list(data = df, cluster = grp_hclust),
|
fviz_cluster(list(data = df, cluster = grp_hclust),
|
||||||
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
|
palette = mycols,
|
||||||
ellipse.type = "convex",
|
ellipse.type = "convex",
|
||||||
show.clust.cent = FALSE,
|
show.clust.cent = FALSE,
|
||||||
ggtheme = theme_bw())
|
ggtheme = theme_bw())
|
||||||
|
|
||||||
|
plot(coor_2d, col = mycols[grp_hclust])
|
||||||
|
rgl::plot3d(coor_3d, col = mycols[grp_hclust])
|
||||||
|
|
||||||
table(dattree[grp_hclust == 1, "Pattern"])
|
table(dattree[grp_hclust == 1, "Pattern"])
|
||||||
table(dattree[grp_hclust == 2, "Pattern"])
|
table(dattree[grp_hclust == 2, "Pattern"])
|
||||||
table(dattree[grp_hclust == 3, "Pattern"])
|
table(dattree[grp_hclust == 3, "Pattern"])
|
||||||
table(dattree[grp_hclust == 4, "Pattern"])
|
table(dattree[grp_hclust == 4, "Pattern"])
|
||||||
|
|
||||||
# Look at 3d plot to see if clusters are actually separate
|
|
||||||
pc <- prcomp(df)
|
|
||||||
coor <- as.data.frame(pc$x[, c(1, 2, 3)])
|
|
||||||
rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_hclust])
|
|
||||||
|
|
||||||
|
aggregate(. ~ grp_hclust, df, mean)
|
||||||
|
|
||||||
|
|
||||||
aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
|
|
||||||
Singularity, centr_degree, centr_degree_loops,
|
|
||||||
centr_between) ~ grp_hclust, dattree, mean)
|
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
|
||||||
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
|
||||||
mean)
|
mean)
|
||||||
|
|
||||||
### DBSCAN clustering
|
#--------------- (2.3) DBSCAN clustering ---------------
|
||||||
|
|
||||||
library(dbscan)
|
library(dbscan)
|
||||||
d1 <- dbscan(df, eps = .5, minPts = 9)
|
d1 <- dbscan(df, eps = .2, minPts = 9)
|
||||||
hullplot(df, d1)
|
hullplot(df, d1)
|
||||||
|
|
||||||
grp_db <- d1$cluster
|
grp_db <- d1$cluster
|
||||||
table(grp_db)
|
table(grp_db)
|
||||||
|
|
||||||
kNNdistplot(df, k = 6)
|
kNNdistplot(df, k = ncol(df))
|
||||||
abline(h = 0.5, col = "red")
|
abline(h = 0.2, col = "red")
|
||||||
|
|
||||||
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
|
||||||
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"),
|
palette = mycols,
|
||||||
ellipse.type = "convex",
|
ellipse.type = "convex",
|
||||||
show.clust.cent = FALSE,
|
show.clust.cent = FALSE,
|
||||||
ggtheme = theme_bw())
|
ggtheme = theme_bw())
|
||||||
|
|
||||||
rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_db + 1])
|
rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
|
||||||
|
|
||||||
aggregate(. ~ grp_db, df, mean)
|
aggregate(. ~ grp_db, df, mean)
|
||||||
|
|
||||||
@ -390,28 +414,12 @@ table(dattree[grp_db == 1, "Pattern"])
|
|||||||
table(dattree[grp_db == 2, "Pattern"])
|
table(dattree[grp_db == 2, "Pattern"])
|
||||||
table(dattree[grp_db == 3, "Pattern"])
|
table(dattree[grp_db == 3, "Pattern"])
|
||||||
|
|
||||||
|
|
||||||
### K-Means clustering
|
|
||||||
|
|
||||||
k1 <- kmeans(df, 4)
|
|
||||||
|
|
||||||
grp_km <- k1$cluster
|
|
||||||
table(grp_km)
|
|
||||||
|
|
||||||
fviz_cluster(list(data = df, cluster = grp_km),
|
|
||||||
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"),
|
|
||||||
ellipse.type = "convex",
|
|
||||||
show.clust.cent = FALSE,
|
|
||||||
ggtheme = theme_bw())
|
|
||||||
|
|
||||||
rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_km])
|
|
||||||
|
|
||||||
### Look at selected cases ###########################################
|
### Look at selected cases ###########################################
|
||||||
tmp <- res
|
tmp <- dat
|
||||||
tmp$start <- tmp$date.start
|
tmp$start <- tmp$date.start
|
||||||
tmp$complete <- tmp$date.stop
|
tmp$complete <- tmp$date.stop
|
||||||
|
|
||||||
alog <- activitylog(tmp[tmp$case == 30855, ],
|
alog <- activitylog(tmp[tmp$case == 30418, ],
|
||||||
case_id = "case",
|
case_id = "case",
|
||||||
activity_id = "item",
|
activity_id = "item",
|
||||||
resource_id = "path",
|
resource_id = "path",
|
||||||
@ -419,25 +427,26 @@ alog <- activitylog(tmp[tmp$case == 30855, ],
|
|||||||
|
|
||||||
process_map(alog)
|
process_map(alog)
|
||||||
|
|
||||||
|
rm(tmp)
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
|
|
||||||
res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
|
res <- merge(dat, data.frame(case = dattree$case, grp_km, grp_hclust, grp_db),
|
||||||
|
by = "case", all.x = TRUE)
|
||||||
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
||||||
|
|
||||||
rm(dat)
|
xtabs( ~ item + grp_db, res)
|
||||||
|
aggregate(event ~ grp_db, res, table)
|
||||||
xtabs( ~ item + grp, res)
|
|
||||||
aggregate(event ~ grp, res, table)
|
|
||||||
|
|
||||||
# Look at clusters
|
# Look at clusters
|
||||||
par(mfrow = c(2, 2))
|
par(mfrow = c(2, 2))
|
||||||
vioplot::vioplot(duration ~ grp, res)
|
vioplot::vioplot(duration ~ grp_db, res)
|
||||||
vioplot::vioplot(distance ~ grp, res)
|
vioplot::vioplot(distance ~ grp_db, res)
|
||||||
vioplot::vioplot(scaleSize ~ grp, res)
|
vioplot::vioplot(scaleSize ~ grp_db, res)
|
||||||
vioplot::vioplot(rotationDegree ~ grp, res)
|
vioplot::vioplot(rotationDegree ~ grp_db, res)
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp_db, res, mean)
|
||||||
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp_db, res, median)
|
||||||
|
|
||||||
write.table(res,
|
write.table(res,
|
||||||
file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
|
file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
|
||||||
@ -453,66 +462,36 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df,
|
|||||||
library(rpart)
|
library(rpart)
|
||||||
library(partykit)
|
library(partykit)
|
||||||
|
|
||||||
## dbscan
|
|
||||||
|
|
||||||
dattree_db <- dattree[grp_db != 0, ]
|
dattree_db <- dattree[grp_db != 0, ]
|
||||||
|
|
||||||
dattree_db$grp <- factor(grp_db[grp_db != 0])
|
dattree_db$grp <- factor(grp_db[grp_db != 0])
|
||||||
dattree_db$Pattern <- factor(dattree_db$Pattern)
|
dattree_db$Pattern <- factor(dattree_db$Pattern)
|
||||||
|
|
||||||
c1 <- rpart(grp ~ AvDurItem + PropItems + SearchInfo + PropMoves +
|
c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
|
||||||
Pattern, data = dattree_db, method = "class")
|
Pattern, data = dattree_db, method = "class")
|
||||||
|
|
||||||
c1 <- rpart(grp_db ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
|
|
||||||
Pattern, data = dattree, method = "class")
|
|
||||||
|
|
||||||
|
|
||||||
plot(as.party(c1))
|
plot(as.party(c1))
|
||||||
|
|
||||||
|
# with conditional tree
|
||||||
|
c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||||
|
PropMoves + Pattern, data = dattree_db, alpha = 0.5)
|
||||||
|
plot(c2)
|
||||||
|
|
||||||
c1a <- rpart(grp_db ~ AvDurItem + PropItems + SearchInfo + PropMoves +
|
c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||||
Pattern, data = dattree, method = "class")
|
PropMoves + Pattern, data = dattree_db, alpha = 0)
|
||||||
|
|
||||||
plot(as.party(c1a))
|
|
||||||
|
|
||||||
|
|
||||||
c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
|
|
||||||
data = dattree_db, method = "class")
|
|
||||||
|
|
||||||
plot(as.party(c2))
|
|
||||||
|
|
||||||
# with conditional tree function
|
|
||||||
c3 <- ctree(as.factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
|
||||||
PropMoves + as.factor(Pattern), data = dattree, alpha = 1)
|
|
||||||
plot(c3)
|
plot(c3)
|
||||||
|
|
||||||
cluster <- as.factor(grp_db[grp_db != 0])
|
c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||||
|
PropMoves + Pattern, data = dattree_db, alpha = 1)
|
||||||
c4 <- ctree(cluster ~ nmove + nflipCard + nopenTopic + nopenPopup,
|
|
||||||
data = datcase[grp_db != 0, ], alpha = .001)
|
|
||||||
plot(c4)
|
plot(c4)
|
||||||
|
|
||||||
|
# including the points excluded above (grp_db == 0, i.e. DBSCAN noise)
|
||||||
c5 <- ctree(cluster ~ duration,
|
c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
|
||||||
data = datcase[grp_db != 0, ], alpha = .001)
|
PropMoves + factor(Pattern), data = dattree, alpha = 1)
|
||||||
plot(c5)
|
plot(c5)
|
||||||
|
|
||||||
## hclust
|
# including the points excluded above (grp_db == 0, i.e. DBSCAN noise), on the scaled df
|
||||||
|
c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1)
|
||||||
c1 <- rpart(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo + PropMoves +
|
plot(c6)
|
||||||
Pattern, data = dattree, method = "class")
|
# --> just checking
|
||||||
|
|
||||||
plot(as.party(c1))
|
|
||||||
|
|
||||||
c3 <- ctree(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo +
|
|
||||||
PropMoves + as.factor(Pattern), data = dattree, alpha = 0)
|
|
||||||
plot(c3)
|
|
||||||
|
|
||||||
c4 <- ctree(as.factor(grp_hclust) ~ nmove + nflipCard + nopenTopic + nopenPopup,
|
|
||||||
data = datcase, alpha = .001)
|
|
||||||
plot(c4)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#--------------- (4) Investigate variants ---------------
|
#--------------- (4) Investigate variants ---------------
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user