diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R index dc66bff..0f03a8d 100644 --- a/code/09_user-navigation.R +++ b/code/09_user-navigation.R @@ -247,14 +247,14 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration rm(tmp) +plot(dattree) - - - -par(mfrow = c(3,3)) +par(mfrow = c(3,4)) hist(dattree$Duration, breaks = 50, main = "") -hist(dattree$SearchInfo, breaks = 50, main = "") +hist(dattree$AvDurItem, breaks = 50, main = "") hist(dattree$PropItems, breaks = 50, main = "") +hist(dattree$PropTopic, breaks = 50, main = "") +hist(dattree$PropPopup, breaks = 50, main = "") hist(dattree$PropMoves, breaks = 50, main = "") hist(dattree$PathLinearity, breaks = 50, main = "") hist(dattree$Singularity, breaks = 50, main = "") @@ -286,13 +286,14 @@ write.table(dattree, df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")] #df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")] -# TODO: With or without duration? Why is it relevant? - df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) + # scale Duration and min/max SearchInfo -df$AvDurItem <- as.numeric(scale(df$AvDurItem)) +#df$AvDurItem <- as.numeric(scale(df$AvDurItem)) +df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) / + (max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE)) #df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / # (max(df$SearchInfo) - min(df$SearchInfo)) df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) / @@ -300,6 +301,33 @@ df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) / df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) / (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE)) + +# "Flatten" with PCA +pc <- prcomp(df) +coor_2d <- as.data.frame(pc$x[, c(1, 2)]) +coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)]) + +#--------------- (2.1) K-Means clustering --------------- + +mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") + +k1 <- kmeans(df, 4) + +grp_km <- k1$cluster +table(grp_km) + +fviz_cluster(list(data = df, cluster = grp_km), + palette = mycols, + ellipse.type = "convex", + show.clust.cent = FALSE, + ggtheme = theme_bw()) + +plot(coor_2d, col = mycols[grp_km]) + +rgl::plot3d(coor_3d, col = mycols[grp_km]) + +#--------------- (2.2) Hierarchical clustering --------------- + mat <- dist(df) # TODO: Do I need to scale all variables? @@ -326,7 +354,7 @@ cor(mat, c5) # https://en.wikipedia.org/wiki/Cophenetic_correlation # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering -hc <- h4 +hc <- h1 # Something like a scree plot (??) plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) @@ -338,50 +366,46 @@ grp_hclust <- cutree(hc, k = k) table(grp_hclust) fviz_cluster(list(data = df, cluster = grp_hclust), - palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"), + palette = mycols, ellipse.type = "convex", show.clust.cent = FALSE, ggtheme = theme_bw()) +plot(coor_2d, col = mycols[grp_hclust]) +rgl::plot3d(coor_3d, col = mycols[grp_hclust]) + table(dattree[grp_hclust == 1, "Pattern"]) table(dattree[grp_hclust == 2, "Pattern"]) table(dattree[grp_hclust == 3, "Pattern"]) table(dattree[grp_hclust == 4, "Pattern"]) -# Look at 3d plot to see if clusters are actually separate -pc <- prcomp(df) -coor <- as.data.frame(pc$x[, c(1, 2, 3)]) -rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_hclust]) +aggregate(. ~ grp_hclust, df, mean) -aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity, - Singularity, centr_degree, centr_degree_loops, - centr_between) ~ grp_hclust, dattree, mean) - aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase, mean) -### DBSCAN clustering +#--------------- (2.3) DBSCAN clustering --------------- library(dbscan) -d1 <- dbscan(df, eps = .5, minPts = 9) +d1 <- dbscan(df, eps = .2, minPts = 9) hullplot(df, d1) grp_db <- d1$cluster table(grp_db) -kNNdistplot(df, k = 6) -abline(h = 0.5, col = "red") +kNNdistplot(df, k = ncol(df)) +abline(h = 0.2, col = "red") fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]), - palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"), + palette = mycols, ellipse.type = "convex", show.clust.cent = FALSE, ggtheme = theme_bw()) -rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_db + 1]) +rgl::plot3d(coor_3d, col = mycols[grp_db + 1]) aggregate(. ~ grp_db, df, mean) @@ -390,28 +414,12 @@ table(dattree[grp_db == 1, "Pattern"]) table(dattree[grp_db == 2, "Pattern"]) table(dattree[grp_db == 3, "Pattern"]) - -### K-Means clustering - -k1 <- kmeans(df, 4) - -grp_km <- k1$cluster -table(grp_km) - -fviz_cluster(list(data = df, cluster = grp_km), - palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"), - ellipse.type = "convex", - show.clust.cent = FALSE, - ggtheme = theme_bw()) - -rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_km]) - ### Look at selected cases ########################################### -tmp <- res +tmp <- dat tmp$start <- tmp$date.start tmp$complete <- tmp$date.stop -alog <- activitylog(tmp[tmp$case == 30855, ], +alog <- activitylog(tmp[tmp$case == 30418, ], case_id = "case", activity_id = "item", resource_id = "path", @@ -419,25 +427,26 @@ alog <- activitylog(tmp[tmp$case == 30855, ], process_map(alog) +rm(tmp) + ###################################################################### -res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE) +res <- merge(dat, data.frame(case = dattree$case, grp_km, grp_hclust, grp_db), + by = "case", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] -rm(dat) - -xtabs( ~ item + grp, res) -aggregate(event ~ grp, res, table) +xtabs( ~ item + grp_db, res) +aggregate(event ~ grp_db, res, table) # Look at clusters par(mfrow = c(2, 2)) -vioplot::vioplot(duration ~ grp, res) -vioplot::vioplot(distance ~ grp, res) -vioplot::vioplot(scaleSize ~ grp, res) -vioplot::vioplot(rotationDegree ~ grp, res) +vioplot::vioplot(duration ~ grp_db, res) +vioplot::vioplot(distance ~ grp_db, res) +vioplot::vioplot(scaleSize ~ grp_db, res) +vioplot::vioplot(rotationDegree ~ grp_db, res) -aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean) -aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median) +aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp_db, res, mean) +aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp_db, res, median) write.table(res, file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv", @@ -453,66 +462,36 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df, library(rpart) library(partykit) -## dbscan - dattree_db <- dattree[grp_db != 0, ] - dattree_db$grp <- factor(grp_db[grp_db != 0]) dattree_db$Pattern <- factor(dattree_db$Pattern) -c1 <- rpart(grp ~ AvDurItem + PropItems + SearchInfo + PropMoves + +c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves + Pattern, data = dattree_db, method = "class") - -c1 <- rpart(grp_db ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves + - Pattern, data = dattree, method = "class") - - plot(as.party(c1)) +# with conditional tree +c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + + PropMoves + Pattern, data = dattree_db, alpha = 0.5) +plot(c2) -c1a <- rpart(grp_db ~ AvDurItem + PropItems + SearchInfo + PropMoves + - Pattern, data = dattree, method = "class") - -plot(as.party(c1a)) - - -c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern, - data = dattree_db, method = "class") - -plot(as.party(c2)) - -# with conditional tree function -c3 <- ctree(as.factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup + - PropMoves + as.factor(Pattern), data = dattree, alpha = 1) +c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + + PropMoves + Pattern, data = dattree_db, alpha = 0) plot(c3) -cluster <- as.factor(grp_db[grp_db != 0]) - -c4 <- ctree(cluster ~ nmove + nflipCard + nopenTopic + nopenPopup, - data = datcase[grp_db != 0, ], alpha = .001) +c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + + PropMoves + Pattern, data = dattree_db, alpha = 1) plot(c4) - -c5 <- ctree(cluster ~ duration, - data = datcase[grp_db != 0, ], alpha = .001) +# with excluded points +c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup + + PropMoves + factor(Pattern), data = dattree, alpha = 1) plot(c5) -## hclust - -c1 <- rpart(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo + PropMoves + - Pattern, data = dattree, method = "class") - -plot(as.party(c1)) - -c3 <- ctree(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo + - PropMoves + as.factor(Pattern), data = dattree, alpha = 0) -plot(c3) - -c4 <- ctree(as.factor(grp_hclust) ~ nmove + nflipCard + nopenTopic + nopenPopup, - data = datcase, alpha = .001) -plot(c4) - - +# with excluded points +c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1) +plot(c6) +# --> just checking #--------------- (4) Investigate variants ---------------