Played around with clustering; switched to glower distances and removed kmeans clustering

2024-03-05 16:25:49 +01:00 · 2024-03-05 16:25:49 +01:00 · f8c1767074
commit f8c1767074
parent ea0660817a
1 changed files with 117 additions and 316 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -74,10 +74,12 @@ topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6]
 topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7]
 topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8]
-datcase <- datcase |> 
+datcase <- datcase |>
  merge(eventtab, by = "case", all = TRUE) |>
  merge(topictab, by = "case", all = TRUE)
 rm(eventtab, topictab)
 datcase$ntopiccards <- aggregate(topic ~ case, dat,
                             function(x) ifelse(all(is.na(x)), NA,
                             length(na.omit(x))), na.action =
@ -136,6 +138,7 @@ check_infocards <- function(subdata, artworks) {
  }
  as.numeric(infocard_only)
 }
 # TODO: Move to helper file
 artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")]
@ -167,214 +170,43 @@ heatmap(cor_mat)
 normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
 }
-
+# TODO: Move to helper file
 #df <- as.data.frame(lapply(datcase[, -1], normalize))
 df <- as.data.frame(lapply(datcase[, -1], scale))
 #df <- datcase[, -1]
 # "Flatten" with PCA
 pc <- prcomp(df)
 coor_2d <- as.data.frame(pc$x[, c(1, 2)])
 coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
 plot(coor_2d)
 rgl::plot3d(coor_3d)
 #--------------- (2.1) K-Means clustering ---------------
 mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
 k1 <- kmeans(df, 4)
 grp_km <- k1$cluster
 table(grp_km)
 fviz_cluster(list(data = df, cluster = grp_km),
             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 plot(coor_2d, col = mycols[grp_km])
 rgl::plot3d(coor_3d, col = mycols[grp_km])
 aggregate(. ~ grp_km, df, mean)
 #--------------- (2.2) Hierarchical clustering ---------------
 mat <- dist(df)
 h1 <- hclust(mat, method = "average")
 h2 <- hclust(mat, method = "complete")
 h3 <- hclust(mat, method = "ward.D")
 h4 <- hclust(mat, method = "ward.D2")
 h5 <- hclust(mat, method = "single")
 # Cophenetic Distances, for each linkage (runs quite some time!)
 c1 <- cophenetic(h1)
 c2 <- cophenetic(h2)
 c3 <- cophenetic(h3)
 c4 <- cophenetic(h4)
 c5 <- cophenetic(h5)
 # Correlations
 cor(mat, c1)
 cor(mat, c2)
 cor(mat, c3)
 cor(mat, c4)
 cor(mat, c5)
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
 hc <- h4
 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
 k <- 4
 grp_hclust <- cutree(hc, k = k)
 table(grp_hclust)
 fviz_cluster(list(data = df, cluster = grp_hclust),
             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 plot(coor_2d, col = mycols[grp_hclust])
 rgl::plot3d(coor_3d, col = mycols[grp_hclust])
 table(dattree[grp_hclust == 1, "Pattern"])
 table(dattree[grp_hclust == 2, "Pattern"])
 table(dattree[grp_hclust == 3, "Pattern"])
 table(dattree[grp_hclust == 4, "Pattern"])
 aggregate(. ~ grp_hclust, df, mean)
 aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
          mean)
 #--------------- (2.3) DBSCAN clustering ---------------
 library(dbscan)
 d1 <- dbscan(df, eps = .15, minPts = ncol(df) + 1)
 hullplot(df, d1)
 grp_db <- d1$cluster
 table(grp_db)
 kNNdistplot(df, k = ncol(df))
 abline(h = 0.2, col = "red")
 abline(h = 0.06, col = "red")
 fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
             #palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 mycols <- c("black", mycols)
 plot(coor_2d, col = mycols[grp_db + 1])
 rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
 aggregate(. ~ grp_db, df, mean)
 table(dattree[grp_db == 0, "Pattern"])
 table(dattree[grp_db == 1, "Pattern"])
 table(dattree[grp_db == 2, "Pattern"])
 table(dattree[grp_db == 3, "Pattern"])
 # Does not really work with these features!
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # Navigation types by Bousbia et al. (2010):
 # - Overviewing: this value is close to the Canter “scanning” value. It
 #   implies that the learner is covering a large proportion of course pages.
 #   Through this phase of fast-reading, the user seeks to acquire an
 #   overall view of the course.
 # - Flitting: close to “wandering”. It reflects a browsing activity without a
 #   strategy or a particular goal. The main difference with the overviewing
 #   type is the lack of focus on the course.
 # - Studying: corresponds to a partial or complete reading of the course
 #   pages where the learner spends time on each page.
 # - Deepening: This describes a learner who spends relatively long time on a
 #   course, checking details, and seeking Web documents related to the course
 #   topics. The main difference with studying is the Web search part that the
 #   learner uses to obtain a deeper understanding of the course.
 # Taxonomy defined by Canter et al. (1985):
 # - Scanning: seeking an overview of a theme (i.e. subpart of the hypermedia)
 #   by requesting an important proportion of its pages but without spending
 #   much time on them.
 # - Browsing: going wherever the data leads the navigator until catching an
 #   interest.
 # - Exploring: reading the viewed pages thoroughly.
 # - Searching: seeking for a particular document or information.
 # - Wandering: navigating in an unstructured fashion without any particular
 #   goal or strategy.
 # Features for navigation types for MTT:
 # - Scanning / Overviewing:
-#   * Proportion of artworks looked at is high: datcase$nitems / 70
+#   * Proportion of artworks looked at is high
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration
 # - Exploring:
-#   * Looking at additional information for most items touched (high value):
+#   * Looking at additional information is high
 #   harmonic mean of datcase$nopenTopic / datcase$nflipCard and
 #   datcase$nopenPopup / datcase$nopenTopic
 # - Searching / Studying:
-#   * Looking only at a few items
+#   * Proportion of artworks looked at is low
 #   datcase$nitems / 70 is low
 #   * Opening few cards
 #   datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low
 #   * but for most cards popups are opened:
 #   datcase$nopenPopup / datcase$nflipCard is high
 # - Wandering / Flitting:
-#   * Items are mostly just moved:
+#   * Proportion of moves is high
 #   datcase$nmove / datcase$length is high
 #   * Duration per case is low:
 #   datcase$duration / mean(datcase$duration) or median(datcase$duration)
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration
 dattree <- data.frame(case = datcase$case,
-                      #Duration = datcase$duration,
+                      NumItems = datcase$nitems,
-                      PropItems = datcase$nitems / length(unique(dat$item)),
+                      NumTopic = datcase$nopenTopic,
-                      #PropTopic = datcase$nopenTopic / datcase$nflipCard,
+                      NumPopup = datcase$nopenPopup,
                      #PropPopup = datcase$nopenPopup / datcase$nopenTopic,
                      # SearchInfo = 
                      #   2*(((datcase$nopenPopup / datcase$nopenTopic) *
                      #       (datcase$nopenTopic / datcase$nflipCard)) /
                      # ((datcase$nopenPopup / datcase$nopenTopic) +
                      #  (datcase$nopenTopic / datcase$nflipCard))
                      # ),
                      SearchInfo = datcase$nopenTopic / datcase$nflipCard +
                        datcase$nopenPopup / datcase$nopenTopic,
                      PropMoves = datcase$nmove / datcase$length,
                      PathLinearity = datcase$nitems / datcase$npaths,
                      Singularity = datcase$npaths / datcase$length
 )
-#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
+dattree$NumTopic <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic)
-dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
+dattree$NumPopup <- ifelse(is.na(dattree$NumPopup), 0, dattree$NumPopup)
 #dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
 #dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)
 get_centrality <- function(case, data) {
  data$start <- data$date.start
  data$complete <- data$date.stop
-  
+
  alog <- activitylog(data[data$case == case, ],
                      case_id     = "case",
                      activity_id = "item",
@ -388,17 +220,16 @@ get_centrality <- function(case, data) {
    igraph::centr_degree(inet, loops = TRUE)$centralization,
    igraph::centr_betw(inet)$centralization)
 }
 # TODO: Move to helper file
 # centrality <- lapply(dattree$case, get_centrality, data = dat)
 # centrality <- do.call(rbind, centrality)
 #
 # save(centrality, file = "results/haum/tmp_centrality.RData")
 load("results/haum/tmp_centrality.RData")
-centrality <- lapply(dattree$case, get_centrality, data = dat)
+#dattree$DegreeCentrality <- centrality[, 2]
-centrality <- do.call(rbind, centrality)
+dattree$BetweenCentrality <- centrality[, 3]
 save(centrality, file = "results/haum/tmp_centrality.RData")
 #dattree$centr_degree <- centrality[, 1]
 #dattree$centr_degree_loops <- centrality[, 2]
 dattree$DegreeCentrality <- centrality[, 2]
 #dattree$BetweenCentrality <- centrality[, 3]
 ## Add average duration per item
@ -413,135 +244,118 @@ tmp$duration <- tmp$max_time - tmp$min_time
 tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case
 dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
 #dattree$AvDurItem <- dattree$AvDurItem / datcase$duration
 rm(tmp)
 summary(dattree)
 plot(dattree[, -1], pch = ".")
-par(mfrow = c(3,4))
+par(mfrow = c(2,4))
 hist(dattree$Duration, breaks = 50, main = "")
 hist(dattree$AvDurItem, breaks = 50, main = "")
-hist(dattree$PropItems, breaks = 50, main = "")
+hist(dattree$NumItems, breaks = 50, main = "")
-hist(dattree$PropTopic, breaks = 50, main = "")
+hist(dattree$NumTopic, breaks = 50, main = "")
-hist(dattree$PropPopup, breaks = 50, main = "")
+hist(dattree$NumPopup, breaks = 50, main = "")
 hist(dattree$PropMoves, breaks = 50, main = "")
 hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
-hist(dattree$centr_degree, breaks = 50, main = "")
+hist(dattree$BetweenCentrality, breaks = 50, main = "")
-hist(dattree$centr_degree_loops, breaks = 50, main = "")
+
-hist(dattree$centr_between, breaks = 50, main = "")
+# Indicator variable if table was used as info terminal only
 dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, labels = c("no", "yes"))
 # Add pattern to datcase; loosely based on Bousbia et al. (2009)
 dattree$Pattern <- "Dispersion"
 dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar",
                          dattree$Pattern)
 dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
                          dattree$BetweenCentrality > 0.5, "Star",
                          dattree$Pattern)
 dattree$Pattern <- factor(dattree$Pattern)
-cor_mat <- cor(dattree[, -1], use = "pairwise")
+# Remove cases with extreme outliers
-diag(cor_mat) <- NA
+# TODO: Do I want this???
 heatmap(cor_mat)
 quantile(datcase$nopenTopic, 0.999)
 quantile(datcase$nopenPopup, 0.999)
-# dattree$Pattern <- "Dispersion"
+dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ]
-# dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 &
+
-#                           dattree$Singularity > 0.8, "Scholar",
+plot(dattree[, -1], pch = ".")
-#                           dattree$Pattern)
+
-# dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 &
+par(mfrow = c(2,4))
-#                           dattree$centr_between > 0.5, "Star",
+hist(dattree$AvDurItem, breaks = 50, main = "")
-#                           dattree$Pattern)
+hist(dattree$NumItems, breaks = 50, main = "")
 hist(dattree$NumTopic, breaks = 50, main = "")
 hist(dattree$NumPopup, breaks = 50, main = "")
 hist(dattree$PropMoves, breaks = 50, main = "")
 hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
 hist(dattree$BetweenCentrality, breaks = 50, main = "")
 write.table(dattree,
            file = "results/haum/dattree.csv",
            sep = ";",
            quote = FALSE,
            row.names = FALSE)
 #--------------- (2) Clustering ---------------
-df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
+library(cluster)
 #df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
 df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
 df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
 df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
-# scale Duration and min/max SearchInfo
+df <- dattree[1:10000, -1] # remove case variable
-#df$AvDurItem <- as.numeric(scale(df$AvDurItem))
+# TODO: Do I need to scale or does normalization also work?
-df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) /
+
-  (max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE))
+# Normalize Duration and Numbers
-#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
+# df$AvDurItem <- normalize(df$AvDurItem)
-#  (max(df$SearchInfo) - min(df$SearchInfo))
+# df$NumItems  <- normalize(df$NumItems)
-df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
+# df$NumTopic  <- normalize(df$NumTopic)
-  (max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE))
+# df$NumPopup  <- normalize(df$NumPopup)
-df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
+
-  (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))
+# summary(df)
 # Look at collinearity
 cor_mat <- cor(df)
 diag(cor_mat) <- NA
 heatmap(cor_mat)
 #df <- as.data.frame(scale(dattree[, -1]))
-#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+#--------------- (2.2) Hierarchical clustering ---------------
 df <- dattree[, -1]
 df$AvDurItem <- normalize(df$AvDurItem)
 df$SearchInfo <- normalize(df$SearchInfo)
 df$InfocardOnly <- datcase$infocardOnly
 summary(df)
 #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-# "Flatten" with PCA
+mat <- daisy(df, metric = "gower")
-pc <- prcomp(df)
+
-coor_2d <- as.data.frame(pc$x[, c(1, 2)])
+# "Flatten" with MDS
-coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
+coor_2d <- as.data.frame(cmdscale(mat, k = 2))
 coor_3d <- as.data.frame(cmdscale(mat, k = 3))
 plot(coor_2d)
 rgl::plot3d(coor_3d)
-#--------------- (2.1) K-Means clustering ---------------
+#mat <- dist(df)
-mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
+# https://uc-r.github.io/hc_clustering
 method <- c(average = "average", single = "single", complete = "complete",
            ward = "ward.D2")
-k1 <- kmeans(df, 4)
+hc_method <- function(x) {
  hclust(mat, method = x)
 }
-grp_km <- k1$cluster
+hcs <- lapply(method, hc_method)
-table(grp_km)
+cds <- lapply(hcs, cophenetic)
-
+cors <- sapply(cds, cor, y = mat)
 fviz_cluster(list(data = df, cluster = grp_km),
             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 plot(coor_2d, col = mycols[grp_km])
 rgl::plot3d(coor_3d, col = mycols[grp_km])
 aggregate(. ~ grp_km, df, mean)
 #--------------- (2.2) Hierarchical clustering ---------------
 mat <- dist(df)
 h1 <- hclust(mat, method = "average")
 h2 <- hclust(mat, method = "complete")
 h3 <- hclust(mat, method = "ward.D")
 h4 <- hclust(mat, method = "ward.D2")
 h5 <- hclust(mat, method = "single")
 # Cophenetic Distances, for each linkage (runs quite some time!)
 c1 <- cophenetic(h1)
 c2 <- cophenetic(h2)
 c3 <- cophenetic(h3)
 c4 <- cophenetic(h4)
 c5 <- cophenetic(h5)
 # Correlations
 cor(mat, c1)
 cor(mat, c2)
 cor(mat, c3)
 cor(mat, c4)
 cor(mat, c5)
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
-hc <- h4
+hc <- hcs$average
 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
 k <- 4
 mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
 grp_hclust <- cutree(hc, k = k)
 table(grp_hclust)
@ -553,17 +367,16 @@ fviz_cluster(list(data = df, cluster = grp_hclust),
             ggtheme = theme_bw())
 plot(coor_2d, col = mycols[grp_hclust])
 legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21)
 rgl::plot3d(coor_3d, col = mycols[grp_hclust])
-table(dattree[grp_hclust == 1, "Pattern"])
+table(datcase[grp_hclust == 1, "Pattern"])
-table(dattree[grp_hclust == 2, "Pattern"])
+table(datcase[grp_hclust == 2, "Pattern"])
-table(dattree[grp_hclust == 3, "Pattern"])
+table(datcase[grp_hclust == 3, "Pattern"])
-table(dattree[grp_hclust == 4, "Pattern"])
+table(datcase[grp_hclust == 4, "Pattern"])
 aggregate(. ~ grp_hclust, df, mean)
 aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
          mean)
@ -571,7 +384,7 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
 #--------------- (2.3) DBSCAN clustering ---------------
 library(dbscan)
-d1 <- dbscan(df, eps = .3, minPts = ncol(df) + 1)
+d1 <- dbscan(df, eps = 1, minPts = ncol(df) + 1)
 hullplot(df, d1)
 grp_db <- d1$cluster
@ -579,10 +392,10 @@ table(grp_db)
 kNNdistplot(df, k = ncol(df))
 abline(h = 0.2, col = "red")
-abline(h = 0.06, col = "red")
+abline(h = 1, col = "red")
 fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
-             #palette = mycols,
+             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())
@ -595,17 +408,20 @@ rgl::plot3d(coor_3d, col = mycols[grp_db + 1])
 aggregate(. ~ grp_db, df, mean)
-table(dattree[grp_db == 0, "Pattern"])
+table(datcase[grp_db == 0, "Pattern"])
-table(dattree[grp_db == 1, "Pattern"])
+table(datcase[grp_db == 1, "Pattern"])
-table(dattree[grp_db == 2, "Pattern"])
+table(datcase[grp_db == 2, "Pattern"])
-table(dattree[grp_db == 3, "Pattern"])
+table(datcase[grp_db == 3, "Pattern"])
 table(datcase[grp_db == 4, "Pattern"])
 ### Look at selected cases ###########################################
 dattree[grp_db == 0, ]
 tmp <- dat
 tmp$start <- tmp$date.start
 tmp$complete <- tmp$date.stop
-alog <- activitylog(tmp[tmp$case == 30418, ],
+alog <- activitylog(tmp[tmp$case == 15, ],
                    case_id     = "case",
                    activity_id = "item",
                    resource_id = "path",
@ -650,38 +466,23 @@ library(partykit)
 dattree_db <- dattree[grp_db != 0, -1]
 dattree_db$grp <- factor(grp_db[grp_db != 0])
 dattree_db$Pattern <- factor(dattree_db$Pattern)
 c1 <- rpart(grp ~ ., data = dattree_db, method = "class")
 plot(as.party(c1))
-c2 <- rpart(as.factor(grp_db) ~ ., data = dattree[, -1], method = "class")
+c2 <- rpart(as.factor(grp_hclust) ~ ., data = dattree[, -1], method = "class")
 plot(as.party(c2))
 c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
              Pattern, data = dattree_db, method = "class")
 plot(as.party(c1))
 # with conditional tree
-c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.5)
+c2 <- ctree(grp ~ ., data = dattree_db, alpha = 0.05)
 plot(c2)
 c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
            PropMoves + Pattern, data = dattree_db, alpha = 0)
 plot(c3)
 c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
            PropMoves + Pattern, data = dattree_db, alpha = 1)
 plot(c4)
 # with excluded points
-c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0.05)
+c5 <- ctree(factor(grp_db) ~ ., data = dattree[, -1], alpha = 0)
 plot(c5)
 # with excluded points
-c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1)
+c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 0)
 plot(c6)
 # --> just checking