# 02_descriptives.R # # content: (1) Read data # (2) Descriptives # (3) Process Mining # (3.1) Check data quality # (3.2) Interactions for different artworks # (3.3) Patterns of cases # (3.4) Artwork sequences # (3.5) Topics # # input: results/event_logfiles_2024-02-21_16-07-33.csv # results/raw_logfiles_2024-02-21_16-07-33.csv # output: figures/counts_item_firsttouch.pdf # figures/duration.pdf # figures/heatmap_start.pdf # figures/heatmap_stop.pdf # figures/timeMs.pdf # figures/xycoord.pdf # figures/event-dist.pdf # figures/traceexplore_trace-event.pdf # figures/ra_trace-event.pdf # figures/traceexplore_case-event.pdf # figures/bp_tod.pdf # figures/bp_wd.pdf # figures/bp_wds.pdf # figures/bp_corona.pdf # figures/traceexplore_case-artwork_often080.pdf # # last mod: 2024-04-17 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/") #--------------- (1) Read data --------------- datlogs <- read.table("results/event_logfiles_2024-02-21_16-07-33.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", rep("numeric", 3), "character", "character", rep("numeric", 11), "character", "character"), sep = ";", header = TRUE) datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard", "openTopic", "openPopup")) datraw <- read.table("results/raw_logfiles_2024-02-21_16-07-33.csv", sep = ";", header = TRUE) # Add weekdays to data frame datlogs$weekdays <- factor(weekdays(datlogs$date.start), levels = c("Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"), labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) ### Number of log files length(unique(datraw$fileId)) # 39767 length(unique(c(datlogs$fileId.start, datlogs$fileId.stop))) # 22789 ### Number of activities nrow(datlogs) table(datlogs$event) proportions(table(datlogs$event)) proportions(table(datlogs$event[datlogs$event != "move"])) ### Time range range(as.Date(datlogs$date.start)) ### Topics per item print(xtabs( ~ item + topic, datlogs), zero = "-") lattice::dotplot(xtabs( ~ item + topic, datlogs), auto.key = TRUE) mat <- t(as.matrix(xtabs( ~ item + topic, datlogs))) mat[mat == 0] <- NA image(mat, axes = F, col = rainbow(100)) #--------------- (2) Descriptives --------------- ### Which item gets touched most often? counts_item <- table(datlogs$item) lattice::barchart(counts_item) items <- unique(datlogs$item) #items <- items[!items %in% c("504", "505")] datart <- mtt::extract_artworks(items, paste0(items, ".xml"), "data/haum/ContentEyevisit/eyevisit_cards_light/") datart <- datart[order(datart$artwork), ] names(counts_item) <- datart$title tmp <- barplot(counts_item, las = 2, ylim = c(0, 60000), border = NA, col = "#434F4F") text(tmp, counts_item + 1000, datart$artwork) ### Which item gets touched most often first? datcase <- datlogs[!duplicated(datlogs$case), ] counts_case <- table(datcase$item) names(counts_case) <- datart$title tmp <- barplot(counts_case, las = 2, border = "white") text(tmp, counts_case + 100, datart$item) counts <- rbind(counts_item, counts_case) pdf("figures/counts_item_firsttouch.pdf", width = 20, height = 10, pointsize = 10) par(mai = c(5, .6, .1, .1)) tmp <- barplot(counts, las = 2, border = NA, col = c("#434F4F", "#FF6900"), ylim = c(0, 65000)) text(tmp, counts_item + counts_case + 1000, datart$artwork) legend("topleft", c("Total interactions", "First interactions"), col = c("#434F4F", "#FF6900"), pch = 15, bty = "n") dev.off() ### Which teasers seem to work well? barplot(table(datlogs$topic), las = 2) ### Dwell times/duration datagg <- aggregate(duration ~ event + item, datlogs, mean) datagg$ds <- datagg$duration / 1000 # in secs lattice::bwplot(ds ~ event, datagg) # without aggregation lattice::bwplot(duration / 1000 / 60 ~ event, datlogs) # in min set.seed(1027) pdf("figures/duration.pdf", width = 5, height = 5, pointsize = 10) lattice::bwplot(I(duration/1000/60) ~ event, datlogs[sample(nrow(datlogs), 100000), ], ylab = "Duration in min") dev.off() ### Move events datmove <- aggregate(cbind(duration, scaleSize, rotationDegree, distance, x.start, y.start, x.stop, y.stop) ~ item, datlogs, mean) hist(log(datlogs$scaleSize)) # --> better interpretable on logscale plot(y.start ~ x.start, datmove, pch = 16, col = "gray") points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datmove$scaleSize) plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y", xlim = c(0, 3840), ylim = c(0, 2160)) with(datmove, text(x.start, y.start, item, col = "gray", cex = 1.5)) with(datmove, arrows(x.start, y.start, x.stop, y.stop, length = 0.07, lwd = 2) ) abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2) datscale <- aggregate(scaleSize ~ item, datlogs, max) plot(y.start ~ x.start, datmove, pch = 16, col = "gray") points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datscale$scaleSize) plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y", xlim = c(0, 3840), ylim = c(0, 2160)) #with(datmove, text(x.stop, y.stop, item)) with(datmove, text(x.start, y.start, item)) ### Are there certain areas of the table that are touched most often? # heatmap cuts <- 100 datlogs$x.start.cat <- cut(datlogs$x.start, cuts) datlogs$y.start.cat <- cut(datlogs$y.start, cuts) tab <- xtabs( ~ x.start.cat + y.start.cat, datlogs) colnames(tab) <- paste0("c", 1:cuts) rownames(tab) <- paste0("c", 1:cuts) heatmap(tab, Rowv = NA, Colv = NA) dattrim <- datlogs[datlogs$x.start < 3840 & datlogs$x.start > 0 & datlogs$y.start < 2160 & datlogs$y.start > 0 & datlogs$x.stop < 3840 & datlogs$x.stop > 0 & datlogs$y.stop < 2160 & datlogs$y.stop > 0, ] cuts <- 100 # 200, 100, 70, ... # start dattrim$x.start.cat <- cut(dattrim$x.start, cuts) dattrim$y.start.cat <- cut(dattrim$y.start, cuts) tab.start <- xtabs( ~ x.start.cat + y.start.cat, dattrim) colnames(tab.start) <- NULL rownames(tab.start) <- NULL pdf("figures/heatmap_start.pdf", width = 5, height = 5, pointsize = 10) heatmap(tab.start, Rowv = NA, Colv = NA) dev.off() # stop dattrim$x.stop.cat <- cut(dattrim$x.stop, cuts) dattrim$y.stop.cat <- cut(dattrim$y.stop, cuts) tab.stop <- xtabs( ~ x.stop.cat + y.stop.cat, dattrim) colnames(tab.stop) <- NULL rownames(tab.stop) <- NULL pdf("figures/heatmap_stop.pdf", width = 5, height = 5, pointsize = 10) heatmap(tab.stop, Rowv = NA, Colv = NA) dev.off() ### How many visitors per day datlogs$date <- as.Date(datlogs$date.start) # Interactions per day datint <- aggregate(case ~ date, datlogs, length) plot(datint, type = "h") # Cases per day datcase <- aggregate(case ~ date, datlogs, function(x) length(unique(x))) plot(datcase, type = "h") # Paths per day datpath <- aggregate(path ~ date, datlogs, function(x) length(unique(x))) plot(datpath, type = "h") plot(path ~ date, datpath, type = "h", col = "#3CB4DC") points(case ~ date, datcase, type = "h") plot(case ~ date, datcase, type = "h", col = "#434F4F") ## weird behavior of timeMs pdf("figures/timeMs.pdf", width = 9, height = 6, pointsize = 10) #par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0)) #plot(timeMs.start ~ as.factor(fileId), datlogs[1:2000,], xlab = "fileId") lattice::bwplot(timeMs.start ~ as.factor(fileId.start), datlogs[1:2000,], xlab = "", scales = list(x = list(rot = 90), y = list(rot = 90))) dev.off() ## x,y-coordinates out of range set.seed(1522) pdf("figures/xycoord.pdf", width = 5, height = 5, pointsize = 10) par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0)) #par(mfrow = c(1, 2)) plot(y.start ~ x.start, datlogs[sample(nrow(datlogs), 10000), ]) abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2) #plot(y.stop ~ x.stop, datlogs) #abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2) legend("bottomleft", "Random sample of 10,000", bg = "white") legend("topleft", "4K-Display: 3840 x 2160", bg = "white") dev.off() ## moves dat001 <- datlogs[which(datlogs$item == "001"), ] index <- as.numeric(as.factor(dat001$path)) cc <- sample(colors(), 100) plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y", xlim = c(0, 3840), ylim = c(0, 2160)) with(dat001[1:200,], arrows(x.start, y.start, x.stop, y.stop, length = .07, col = cc[index])) plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y", xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray") points(y.start ~ x.start, dat001, xlab = "x", ylab = "y", xlim = c(0, 3840), ylim = c(0, 2160), cex = dat001$scaleSize, col = "blue") cc <- sample(colors(), 70) dat1 <- datlogs[!duplicated(datlogs$item), ] dat1 <- dat1[order(dat1$item), ] plot(y.start ~ x.start, dat1, type = "n", xlim = c(-100, 4500), ylim = c(-100, 2500)) abline(h = c(0, 2160), v = c(0, 3840), col = "lightgray") with(dat1, points(x.start, y.start, col = cc, pch = 16)) with(dat1, points(x.stop, y.stop, col = cc, pch = 16)) with(dat1, arrows(x.start, y.start, x.stop, y.stop, length = .07, col = cc)) # How many events per topic, per path, ... # How many popups per artwork? # Number of events per artwork tab <- xtabs( ~ item + event, datlogs) addmargins(tab) proportions(tab, margin = "item") proportions(tab, margin = "event") cc <- palette.colors(palette = "Okabe-Ito")[c(3,2,4,8)] pdf("figures/event-dist.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0)) barplot(t(proportions(tab, margin = "item")), las = 2, col = c("#78004B", "#3CB4DC", "#91C86E", "#FF6900"), legend.text = levels(datlogs$event), args.legend = list(x = "bottomleft", bg = "white")) dev.off() #lattice::barchart(proportions(tab, margin = "item"), las = 2) # Proportion of events proportions(xtabs( ~ event, datlogs)) # Mean proportion of event per path colMeans(proportions(xtabs( ~ path + event, datlogs), margin = "path")) # Mean proportion of event per item colMeans(proportions(tab, margin = "item")) # Proportion of unclosed events nrow(datlogs[is.na(datlogs$complete), ]) nrow(datlogs[is.na(datlogs$complete), ]) / nrow(datlogs) # Proportion of events spanning more than one log file sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE) sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE) / nrow(datlogs) #--------------- (3) Process Mining --------------- #--------------- (3.1) Check data quality --------------- datlogs$start <- datlogs$date.start datlogs$complete <- datlogs$date.stop alog <- bupaR::activitylog(datlogs, case_id = "path", activity_id = "event", #resource_id = "case", resource_id = "item", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) alogf <- edeaR::filter_trace_frequency(alog, percentage = 0.9) processmapR::process_map(alogf, # alog, type_nodes = processmapR::frequency("absolute"), sec_nodes = processmapR::frequency("relative"), type_edges = processmapR::frequency("absolute"), sec_edges = processmapR::frequency("relative"), rankdir = "TB") alog_no_move <- alog[alog$event != "move", ] pdf("figures/traceexplore_trace-event.pdf", height = 8, width = 12, pointsize = 10) set.seed(1447) processmapR::trace_explorer(alog_no_move[alog_no_move$path %in% sample(unique(alog_no_move$path), 400),], coverage = 1, type = "frequent", abbreviate = T) dev.off() pdf("figures/ra_trace-event.pdf", height = 8, width = 12, pointsize = 10) ra_no_move <- edeaR::resource_frequency(alog_no_move, "resource-activity") levels(ra_no_move$event) <- c("flipCard", "flipCard", "openTopic", "openPopup") plot(ra_no_move) dev.off() ra <- edeaR::resource_frequency(alog, "resource-activity") plot(ra) heatmap(xtabs(relative_activity ~ artwork + event, ra)) heatmap(xtabs(relative_resource ~ artwork + event, ra_no_move)) heatmap(xtabs(relative_activity ~ artwork + event, ra_no_move)) aggregate(relative_activity ~ event, ra, sum) aggregate(relative_resource ~ artwork, ra, sum) #--------------- (3.2) Interactions for different artworks --------------- # Do interaction patterns for events per trace look different for different # artworks? which.max(table(datlogs$artwork)) which.min(table(datlogs$artwork)) which.min(table(datlogs$artwork)[-c(71,72)]) alog080 <- bupaR::activitylog(datlogs[datlogs$artwork == "080",], case_id = "path", activity_id = "event", resource_id = "artwork", timestamps = c("start", "complete")) processmapR::process_map(alog80, processmapR::frequency("relative")) alog087 <- bupaR::activitylog(datlogs[datlogs$artwork == "087",], case_id = "path", activity_id = "event", resource_id = "artwork", timestamps = c("start", "complete")) processmapR::process_map(alog087, processmapR::frequency("relative")) alog504 <- bupaR::activitylog(datlogs[datlogs$artwork == "504",], case_id = "path", activity_id = "event", resource_id = "artwork", timestamps = c("start", "complete")) processmapR::process_map(alog504, processmapR::frequency("relative")) #--------------- (3.3) Patterns of cases --------------- # What kind of patterns do we have? Are their typical sequences for cases? # Do case patterns look different for ... # ... mornings and afternoons? # ... weekdays and weekends? # ... weekdays for "normal" and school vacation days? # ... pre and post corona? alog <- bupaR::activitylog(datlogs, case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) alog_no_move <- alog[alog$event != "move", ] pdf("figures/traceexplore_case-event.pdf", height = 8, width = 12, pointsize = 10) set.seed(1050) processmapR::trace_explorer(alog_no_move[alog_no_move$path %in% sample(unique(alog_no_move$path), 300),], coverage = 1, type = "frequent", abbreviate = T) dev.off() processmapR::process_map(alog080, processmapR::frequency("relative")) alog087 <- bupaR::activitylog(datlogs[datlogs$artwork == "087",], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog087, processmapR::frequency("relative")) ### Mornings and afternoons datlogs$tod <- ifelse(lubridate::hour(datlogs$start) > 13, "afternoon", "morning") alog <- bupaR::activitylog(datlogs[datlogs$tod == "morning",], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) alog <- bupaR::activitylog(datlogs[datlogs$tod == "afternoon",], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) # Are the same artworks looked at? pdf("figures/bp_tod.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) barplot(proportions(xtabs( ~ tod + artwork, datlogs), margin = "tod"), #col = cc[1:2], las = 2, beside = TRUE, legend = c("afternoon", "morning"), args.legend = list(x = "topleft")) dev.off() ### Weekdays and weekends datlogs$wd <- ifelse(datlogs$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday") alog <- bupaR::activitylog(datlogs[datlogs$wd == "weekend",], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) alog <- bupaR::activitylog(datlogs[datlogs$wd == "weekday",], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) # Are the same artworks looked at? pdf("figures/bp_wd.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) barplot(proportions(xtabs( ~ wd + artwork, datlogs), margin = "wd"), las = 2, beside = TRUE, legend = c("weekday", "weekend"), args.legend = list(x = "topleft")) dev.off() ### Weekdays vs. school vacation weekdays datlogs$wds <- ifelse(!is.na(datlogs$vacation), "vacation", "school") datlogs$wds[datlogs$wd == "weekend"] <- NA alog <- bupaR::activitylog(datlogs[which(datlogs$wds == "school"),], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) alog <- bupaR::activitylog(datlogs[which(datlogs$wds == "vacation"),], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) # Are the same artworks looked at? pdf("figures/bp_wds.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) #barplot(xtabs( ~ wds + artwork, datlogs), las = 2, beside = TRUE, barplot(proportions(xtabs( ~ wds + artwork, datlogs), margin = "wds"), las = 2, beside = TRUE, legend = c("school", "vacation"), args.legend = list(x = "topleft")) dev.off() ### Pre and post Corona datlogs$corona <- ifelse(datlogs$date < "2020-03-14", "pre", "post") alog <- bupaR::activitylog(datlogs[which(datlogs$corona == "pre"),], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) alog <- bupaR::activitylog(datlogs[which(datlogs$corona == "post"),], case_id = "case", activity_id = "event", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) # Are the same artworks looked at? pdf("figures/bp_corona.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) barplot(proportions(xtabs( ~ corona + artwork, datlogs), margin = "corona"), las = 2, beside = TRUE, legend = c("post", "pre"), args.legend = list(x = "topleft")) dev.off() #--------------- (3.4) Artwork sequences --------------- # Order in which artworks are looked at nart <- 5 # select 5 artworks randomly alog <- bupaR::activitylog(datlogs,#[datlogs$artwork %in% sample(unique(datlogs$artwork), nart), ], case_id = "case", activity_id = "artwork", resource_id = "path", timestamps = c("start", "complete")) #map <- processmapR::process_map(alog, processmapR::frequency("relative")) ## select cases with Vermeer length(unique(datlogs[datlogs$artwork == "080", "case"])) # 12615 case080 <- unique(datlogs[datlogs$artwork == "080", "case"]) tmp <- datlogs[datlogs$case %in% case080, ] table(tmp$artwork) # --> all :) # select the ones most often (I am aiming for 10...) barplot(table(tmp$artwork)) abline(h = 14000, col = "red") which(table(tmp$artwork) > 14000) often080 <- names(which(table(tmp$artwork) > 14000)) alog <- bupaR::activitylog(datlogs[datlogs$artwork %in% often080, ], case_id = "case", activity_id = "artwork", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) pdf("figures/traceexplore_case-artwork_often080.pdf", height = 8, width = 12, pointsize = 10) processmapR::trace_explorer(alog, n_traces = 30, type = "frequent", abbreviate = TRUE) dev.off() #--------------- (3.5) Topics --------------- # Are there certain topics that people are interested in more than others? # Do these topic distributions differ for comparable artworks? alog <- bupaR::activitylog(datlogs[which(datlogs$event == "openTopic"),], case_id = "case", activity_id = "topic", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) # Order of topics for Vermeer # alog080 <- bupaR::activitylog(datlogs[datlogs$artwork == "080",], # case_id = "case", # activity_id = "topic", # resource_id = "path", # timestamps = c("start", "complete")) # # alog080 <- bupaR::activitylog(datlogs[datlogs$artwork == "080",], # case_id = "case", # activity_id = "topicFile", # resource_id = "path", # timestamps = c("start", "complete")) # # #processmapR::process_map(alog080, processmapR::frequency("relative")) # # # Comparable artwork # alog083 <- bupaR::activitylog(datlogs[datlogs$artwork == "083",], # case_id = "case", # activity_id = "topic", # resource_id = "path", # timestamps = c("start", "complete")) # artworks that have the same topics than Vermeer which(rowSums(xtabs( ~ artwork + topic, datlogs[datlogs$topic %in% c("artist", "details", "extra info", "komposition", "licht und farbe", "thema"), ]) != 0) == 6) #037 046 062 080 083 109 for (art in c("037", "046", "062", "080", "083", "109")) { alog <- bupaR::activitylog(datlogs[datlogs$event == "openTopic" & datlogs$artwork == art,], case_id = "case", activity_id = "topic", resource_id = "path", timestamps = c("start", "complete")) processmapR::process_map(alog, processmapR::frequency("relative")) }