From 7d67482e0ca4f73d745ecc9b0cab3fec57fb7376 Mon Sep 17 00:00:00 2001 From: nwickel Date: Wed, 31 Jan 2024 12:09:27 +0100 Subject: [PATCH] Next round of script cleaning --- .../{descriptives_2nd.R => 02_descriptives.R} | 228 +++++++------ ...eate-petrinet.py => 03_create-petrinet.py} | 0 ...checking.py => 04_conformance-checking.py} | 2 - code/{check-traces.R => 05_check-traces.R} | 2 - code/{04_infos-items.py => 07_infos-items.py} | 0 ...item-clustering.R => 08_item-clustering.R} | 0 code/descriptives.R | 301 ------------------ 7 files changed, 124 insertions(+), 409 deletions(-) rename code/{descriptives_2nd.R => 02_descriptives.R} (67%) rename code/{02_create-petrinet.py => 03_create-petrinet.py} (100%) rename code/{03_conformance-checking.py => 04_conformance-checking.py} (99%) rename code/{check-traces.R => 05_check-traces.R} (96%) rename code/{04_infos-items.py => 07_infos-items.py} (100%) rename code/{05_item-clustering.R => 08_item-clustering.R} (100%) delete mode 100644 code/descriptives.R diff --git a/code/descriptives_2nd.R b/code/02_descriptives.R similarity index 67% rename from code/descriptives_2nd.R rename to code/02_descriptives.R index b87e05a..92e4966 100644 --- a/code/descriptives_2nd.R +++ b/code/02_descriptives.R @@ -27,46 +27,66 @@ library(bupaverse) #--------------- (1) Read data --------------- -dat <- read.table("results/haum/event_logfiles_glossar_2023-12-28_09-49-43.csv", - sep = ";", header = TRUE, - colClasses = c("POSIXct", "character", "integer", - "integer", "numeric", "integer", - "character", "character", "character", - "character", "POSIXct", "POSIXct", - "numeric", "numeric", "numeric", - "integer", "character", - rep("numeric", 11), "integer", - "character", "character", "logical", - "logical", "logical", "character", - "character")) +datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), + sep = ";", header = TRUE) -dat$date <- NULL -# TODO: Remove, after rerunning preprocessing +datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard", + "openTopic", + "openPopup")) -dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup")) +datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv", + sep = ";", header = TRUE) # Add weekdays to data frame -dat$weekdays <- factor(weekdays(dat$date.start), +datlogs$weekdays <- factor(weekdays(datlogs$date.start), levels = c("Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"), labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) +### Number of log files +length(unique(datraw$fileId)) +# 39767 +length(unique(c(datlogs$fileId.start, datlogs$fileId.stop))) +# 22789 + +### Number of activities +nrow(datlogs) +table(datlogs$event) +proportions(table(datlogs$event)) +proportions(table(datlogs$event[datlogs$event != "move"])) + +### Time range +range(as.Date(datlogs$date.start)) + +### Topics per item +print(xtabs( ~ item + topic, datlogs), zero = "-") +lattice::dotplot(xtabs( ~ item + topic, datlogs), auto.key = TRUE) + +mat <- t(as.matrix(xtabs( ~ item + topic, datlogs))) +mat[mat == 0] <- NA +image(mat, axes = F, col = rainbow(100)) +heatmap(t(mat)) + + +datlogs$start <- datlogs$date.start +datlogs$complete <- datlogs$date.stop -names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", "complete") -dat$trail <- dat$trace -dat$trace <- NULL -# --> needs to be changed since "trace" is an inbuilt variable name in -# bupar #--------------- (2) Descriptives --------------- -# How many events per topic, per trace, ... +# How many events per topic, per path, ... # How many popups per artwork? # Number of events per artwork -tab <- xtabs( ~ artwork + event, dat) +tab <- xtabs( ~ artwork + event, datlogs) addmargins(tab) proportions(tab, margin = "artwork") @@ -78,35 +98,35 @@ pdf("../figures/event-dist.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0)) barplot(t(proportions(tab, margin = "artwork")), las = 2, col = cc, - legend.text = levels(dat$event), args.legend = list(x = "bottomleft", bg = "white")) + legend.text = levels(datlogs$event), args.legend = list(x = "bottomleft", bg = "white")) dev.off() #barchart(proportions(tab, margin = "artwork"), las = 2) # Proportion of events -proportions(xtabs( ~ event, dat)) -# Mean proportion of event per trace -colMeans(proportions(xtabs( ~ trail + event, dat), margin = "trail")) +proportions(xtabs( ~ event, datlogs)) +# Mean proportion of event per path +colMeans(proportions(xtabs( ~ path + event, datlogs), margin = "path")) # Mean proportion of event per artwork colMeans(proportions(tab, margin = "artwork")) # Proportion of unclosed events -nrow(dat[is.na(dat$complete), ]) -nrow(dat[is.na(dat$complete), ]) / nrow(dat) +nrow(datlogs[is.na(datlogs$complete), ]) +nrow(datlogs[is.na(datlogs$complete), ]) / nrow(datlogs) # Proportion of events spanning more than one log file -sum(dat$fileId.start != dat$fileId.stop, na.rm = TRUE) -sum(dat$fileId.start != dat$fileId.stop, na.rm = TRUE) / nrow(dat) +sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE) +sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE) / nrow(datlogs) #--------------- (3) Process Mining --------------- #--------------- (3.1) Check data quality --------------- -alog <- activitylog(dat, - case_id = "trail", +alog <- activitylog(datlogs, + case_id = "path", activity_id = "event", #resource_id = "case", resource_id = "artwork", @@ -128,8 +148,8 @@ alog_no_move <- alog[alog$event != "move", ] pdf("../figures/traceexplore_trace-event.pdf", height = 8, width = 12, pointsize = 10) set.seed(1447) -processmapR::trace_explorer(alog_no_move[alog_no_move$trail %in% - sample(unique(alog_no_move$trail), 400),], +processmapR::trace_explorer(alog_no_move[alog_no_move$path %in% + sample(unique(alog_no_move$path), 400),], coverage = 1, type = "frequent", abbreviate = T) dev.off() @@ -157,28 +177,28 @@ aggregate(relative_resource ~ artwork, ra, sum) # Do interaction patterns for events per trace look different for different # artworks? -which.max(table(dat$artwork)) -which.min(table(dat$artwork)) -which.min(table(dat$artwork)[-c(71,72)]) +which.max(table(datlogs$artwork)) +which.min(table(datlogs$artwork)) +which.min(table(datlogs$artwork)[-c(71,72)]) -alog080 <- activitylog(dat[dat$artwork == "080",], - case_id = "trail", +alog080 <- activitylog(datlogs[datlogs$artwork == "080",], + case_id = "path", activity_id = "event", resource_id = "artwork", timestamps = c("start", "complete")) map_as_pdf(alog080, file = "../figures/pm_trace-event_080.pdf") -alog087 <- activitylog(dat[dat$artwork == "087",], - case_id = "trail", +alog087 <- activitylog(datlogs[datlogs$artwork == "087",], + case_id = "path", activity_id = "event", resource_id = "artwork", timestamps = c("start", "complete")) map_as_pdf(alog087, file = "../figures/pm_trace-event_087.pdf") -alog504 <- activitylog(dat[dat$artwork == "504",], - case_id = "trail", +alog504 <- activitylog(datlogs[datlogs$artwork == "504",], + case_id = "path", activity_id = "event", resource_id = "artwork", timestamps = c("start", "complete")) @@ -194,10 +214,10 @@ map_as_pdf(alog504, file = "../figures/pm_trace-event_504.pdf") # ... weekdays for "normal" and school vacation days? # ... pre and post corona? -alog <- activitylog(dat, +alog <- activitylog(datlogs, case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event.pdf") @@ -206,38 +226,38 @@ alog_no_move <- alog[alog$event != "move", ] pdf("../figures/traceexplore_case-event.pdf", height = 8, width = 12, pointsize = 10) set.seed(1050) -processmapR::trace_explorer(alog_no_move[alog_no_move$trail %in% - sample(unique(alog_no_move$trail), 300),], +processmapR::trace_explorer(alog_no_move[alog_no_move$path %in% + sample(unique(alog_no_move$path), 300),], coverage = 1, type = "frequent", abbreviate = T) dev.off() map_as_pdf(alog080, file = "../figures/pm_case-event_080.pdf") -alog087 <- activitylog(dat[dat$artwork == "087",], +alog087 <- activitylog(datlogs[datlogs$artwork == "087",], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog087, file = "../figures/pm_case-event_087.pdf") ### Mornings and afternoons -dat$tod <- ifelse(lubridate::hour(dat$start) > 13, "afternoon", "morning") +datlogs$tod <- ifelse(lubridate::hour(datlogs$start) > 13, "afternoon", "morning") -alog <- activitylog(dat[dat$tod == "morning",], +alog <- activitylog(datlogs[datlogs$tod == "morning",], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_morning.pdf") -alog <- activitylog(dat[dat$tod == "afternoon",], +alog <- activitylog(datlogs[datlogs$tod == "afternoon",], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf") @@ -246,7 +266,7 @@ map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf") pdf("../figures/bp_tod.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) -barplot(proportions(xtabs( ~ tod + artwork, dat), margin = "tod"), #col = cc[1:2], +barplot(proportions(xtabs( ~ tod + artwork, datlogs), margin = "tod"), #col = cc[1:2], las = 2, beside = TRUE, legend = c("afternoon", "morning"), args.legend = list(x = "topleft")) @@ -254,20 +274,20 @@ dev.off() ### Weekdays and weekends -dat$wd <- ifelse(dat$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday") +datlogs$wd <- ifelse(datlogs$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday") -alog <- activitylog(dat[dat$wd == "weekend",], +alog <- activitylog(datlogs[datlogs$wd == "weekend",], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_weekend.pdf") -alog <- activitylog(dat[dat$wd == "weekday",], +alog <- activitylog(datlogs[datlogs$wd == "weekday",], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf") @@ -276,7 +296,7 @@ map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf") pdf("../figures/bp_wd.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) -barplot(proportions(xtabs( ~ wd + artwork, dat), margin = "wd"), +barplot(proportions(xtabs( ~ wd + artwork, datlogs), margin = "wd"), las = 2, beside = TRUE, legend = c("weekday", "weekend"), args.legend = list(x = "topleft")) @@ -284,21 +304,21 @@ dev.off() ### Weekdays vs. school vacation weekdays -dat$wds <- ifelse(!is.na(dat$vacation), "vacation", "school") -dat$wds[dat$wd == "weekend"] <- NA +datlogs$wds <- ifelse(!is.na(datlogs$vacation), "vacation", "school") +datlogs$wds[datlogs$wd == "weekend"] <- NA -alog <- activitylog(dat[which(dat$wds == "school"),], +alog <- activitylog(datlogs[which(datlogs$wds == "school"),], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_school.pdf") -alog <- activitylog(dat[which(dat$wds == "vacation"),], +alog <- activitylog(datlogs[which(datlogs$wds == "vacation"),], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf") @@ -307,8 +327,8 @@ map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf") pdf("../figures/bp_wds.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) -#barplot(xtabs( ~ wds + artwork, dat), las = 2, beside = TRUE, -barplot(proportions(xtabs( ~ wds + artwork, dat), margin = "wds"), +#barplot(xtabs( ~ wds + artwork, datlogs), las = 2, beside = TRUE, +barplot(proportions(xtabs( ~ wds + artwork, datlogs), margin = "wds"), las = 2, beside = TRUE, legend = c("school", "vacation"), args.legend = list(x = "topleft")) @@ -316,20 +336,20 @@ dev.off() ### Pre and post Corona -dat$corona <- ifelse(dat$date < "2020-03-14", "pre", "post") +datlogs$corona <- ifelse(datlogs$date < "2020-03-14", "pre", "post") -alog <- activitylog(dat[which(dat$corona == "pre"),], +alog <- activitylog(datlogs[which(datlogs$corona == "pre"),], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_pre-corona.pdf") -alog <- activitylog(dat[which(dat$corona == "post"),], +alog <- activitylog(datlogs[which(datlogs$corona == "post"),], case_id = "case", activity_id = "event", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf") @@ -338,7 +358,7 @@ map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf") pdf("../figures/bp_corona.pdf", height = 3.375, width = 12, pointsize = 10) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) -barplot(proportions(xtabs( ~ corona + artwork, dat), margin = "corona"), +barplot(proportions(xtabs( ~ corona + artwork, datlogs), margin = "corona"), las = 2, beside = TRUE, legend = c("post", "pre"), args.legend = list(x = "topleft")) @@ -348,19 +368,19 @@ dev.off() # Order in which artworks are looked at nart <- 5 # select 5 artworks randomly -alog <- activitylog(dat,#[dat$artwork %in% sample(unique(dat$artwork), nart), ], +alog <- activitylog(datlogs,#[datlogs$artwork %in% sample(unique(datlogs$artwork), nart), ], case_id = "case", activity_id = "artwork", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) #map <- process_map(alog, frequency("relative")) ## select cases with Vermeer -length(unique(dat[dat$artwork == "080", "case"])) +length(unique(datlogs[datlogs$artwork == "080", "case"])) # 12615 -case080 <- unique(dat[dat$artwork == "080", "case"]) -tmp <- dat[dat$case %in% case080, ] +case080 <- unique(datlogs[datlogs$artwork == "080", "case"]) +tmp <- datlogs[datlogs$case %in% case080, ] table(tmp$artwork) # --> all :) @@ -371,10 +391,10 @@ which(table(tmp$artwork) > 14000) often080 <- names(which(table(tmp$artwork) > 14000)) -alog <- activitylog(dat[dat$artwork %in% often080, ], +alog <- activitylog(datlogs[datlogs$artwork %in% often080, ], case_id = "case", activity_id = "artwork", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-artwork_often080.pdf") @@ -393,43 +413,43 @@ dev.off() # Are there certain topics that people are interested in more than others? # Do these topic distributions differ for comparable artworks? -alog <- activitylog(dat[which(dat$event == "openTopic"),], +alog <- activitylog(datlogs[which(datlogs$event == "openTopic"),], case_id = "case", activity_id = "topic", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = "../figures/pm_case-topic.pdf") # Order of topics for Vermeer -# alog080 <- activitylog(dat[dat$artwork == "080",], +# alog080 <- activitylog(datlogs[datlogs$artwork == "080",], # case_id = "case", # activity_id = "topic", -# resource_id = "trail", +# resource_id = "path", # timestamps = c("start", "complete")) # # map_as_pdf(alog080, file = "../figures/pm_case-topic_080.pdf") # # -# alog080 <- activitylog(dat[dat$artwork == "080",], +# alog080 <- activitylog(datlogs[datlogs$artwork == "080",], # case_id = "case", # activity_id = "topicFile", -# resource_id = "trail", +# resource_id = "path", # timestamps = c("start", "complete")) # # #process_map(alog080, frequency("relative")) # # # Comparable artwork -# alog083 <- activitylog(dat[dat$artwork == "083",], +# alog083 <- activitylog(datlogs[datlogs$artwork == "083",], # case_id = "case", # activity_id = "topic", -# resource_id = "trail", +# resource_id = "path", # timestamps = c("start", "complete")) # # map_as_pdf(alog083, file = "../figures/pm_case-topic_083.pdf") # artworks that have the same topics than Vermeer -which(rowSums(xtabs( ~ artwork + topic, dat[dat$topic %in% +which(rowSums(xtabs( ~ artwork + topic, datlogs[datlogs$topic %in% c("artist", "details", "extra info", "komposition", "licht und farbe", "thema"), ]) != 0) == 6) @@ -437,10 +457,10 @@ which(rowSums(xtabs( ~ artwork + topic, dat[dat$topic %in% for (art in c("037", "046", "062", "080", "083", "109")) { - alog <- activitylog(dat[dat$event == "openTopic" & dat$artwork == art,], + alog <- activitylog(datlogs[datlogs$event == "openTopic" & datlogs$artwork == art,], case_id = "case", activity_id = "topic", - resource_id = "trail", + resource_id = "path", timestamps = c("start", "complete")) map_as_pdf(alog, file = paste0("../figures/pm_case-topic_", art, ".pdf")) @@ -467,11 +487,11 @@ c("Kultur", "Kultur", "Graphik", "Gemälde", "Gemälde", "Gemälde", # BURSTS -which.max(table(dat$date)) -tmp <- dat[dat$date == "2017-02-12", ] +which.max(table(datlogs$date)) +tmp <- datlogs[datlogs$date == "2017-02-12", ] # number of traces per case on 2017-02-12 -rowSums(xtabs( ~ case + trail, tmp) != 0) +rowSums(xtabs( ~ case + path, tmp) != 0) range(tmp$start) hours <- lubridate::hour(tmp$start) @@ -481,20 +501,20 @@ xtabs( ~ case + hours, tmp) colSums(xtabs( ~ case + hours, tmp) != 0) barplot(colSums(xtabs( ~ case + hours, tmp) != 0)) -aggregate(trail ~ case + hours, tmp, length) +aggregate(path ~ case + hours, tmp, length) -tmp <- aggregate(trail ~ case, dat, length) -tmp$date <- as.Date(dat[!duplicated(dat$case), "start"]) -tmp$time <- lubridate::hour(dat[!duplicated(dat$case), "start"]) +tmp <- aggregate(path ~ case, datlogs, length) +tmp$date <- as.Date(datlogs[!duplicated(datlogs$case), "start"]) +tmp$time <- lubridate::hour(datlogs[!duplicated(datlogs$case), "start"]) -tmp[tmp$trail > 200, ] +tmp[tmp$path > 200, ] -plot(trail ~ time, tmp, cex = 2, col = rgb(0,0,0,.3)) +plot(path ~ time, tmp, cex = 2, col = rgb(0,0,0,.3)) -lattice::barchart(trail ~ time, tmp, horizontal=F) +lattice::barchart(path ~ time, tmp, horizontal=F) diff --git a/code/02_create-petrinet.py b/code/03_create-petrinet.py similarity index 100% rename from code/02_create-petrinet.py rename to code/03_create-petrinet.py diff --git a/code/03_conformance-checking.py b/code/04_conformance-checking.py similarity index 99% rename from code/03_conformance-checking.py rename to code/04_conformance-checking.py index 31da40f..f8401d5 100644 --- a/code/03_conformance-checking.py +++ b/code/04_conformance-checking.py @@ -1,8 +1,6 @@ import pm4py - import pandas as pd import numpy as np - from python_helpers import eval_pm, pn_infos_miner ###### Load data and create event logs ###### diff --git a/code/check-traces.R b/code/05_check-traces.R similarity index 96% rename from code/check-traces.R rename to code/05_check-traces.R index 8665c53..d640c14 100644 --- a/code/check-traces.R +++ b/code/05_check-traces.R @@ -1,5 +1,3 @@ -# TODO: Clean me up! I am a mix of useful and useless!!! - # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") #--------------- (1) Look at broken trace --------------- diff --git a/code/04_infos-items.py b/code/07_infos-items.py similarity index 100% rename from code/04_infos-items.py rename to code/07_infos-items.py diff --git a/code/05_item-clustering.R b/code/08_item-clustering.R similarity index 100% rename from code/05_item-clustering.R rename to code/08_item-clustering.R diff --git a/code/descriptives.R b/code/descriptives.R deleted file mode 100644 index 218c090..0000000 --- a/code/descriptives.R +++ /dev/null @@ -1,301 +0,0 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/") - -library(lattice) -#library(mtt) -devtools::load_all("../../../../software/mtt") - -# Achims colors (used by lattice) -#cc <- palette.colors(palette = "Okabe-Ito") -#plot(1:10, col = cc, pch = 16, cex = 2) - -# Read data - -datlogs <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv", - colClasses = c("character", "character", "POSIXct", - "POSIXct", "character", "integer", - "numeric", "character", "character", - rep("numeric", 3), "character", - "character", rep("numeric", 11), - "character", "character"), - sep = ";", header = TRUE) - -datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard", - "openTopic", - "openPopup")) - -### Number of log files -length(unique(datlogs$fileId.start)) -length(unique(datlogs$fileId.stop)) - -length(unique(c(datlogs$fileId.start, datlogs$fileId.stop))) -# 22803 - -### Number of activities -nrow(datlogs) -table(datlogs$event) -proportions(table(datlogs$event)) -proportions(table(datlogs$event[datlogs$event != "move"])) - -### Time range -range(as.Date(datlogs$date.start)) - -### Topics per item -print(xtabs( ~ item + topic, datlogs), zero = "-") -lattice::dotplot(xtabs( ~ item + topic, datlogs), auto.key = TRUE) - -### Which item gets touched most often? - -counts_item <- table(datlogs$item) -barchart(counts_item) - -items <- unique(datlogs$item) -#items <- items[!items %in% c("504", "505")] -datart <- extract_artworks(items, - paste0(items, ".xml"), - "../data/haum/ContentEyevisit/eyevisit_cards_light/") -datart <- datart[order(datart$artwork), ] -names(counts_item) <- datart$title - -pdf("../figures/counts_item.pdf", width = 20, height = 10, pointsize = 10) -par(mai = c(5, .6, .1, .1)) -tmp <- barplot(counts_item, las = 2, ylim = c(0, 60000), - border = "white", col = "#3CB4DC") -text(tmp, counts_item + 1000, datart$artwork) -dev.off() - -### Which item gets touched most often first? - -datcase <- datlogs[!duplicated(datlogs$case), ] -counts_case <- table(datcase$item) -names(counts_case) <- datart$title -tmp <- barplot(counts_case, las = 2, border = "white") -text(tmp, counts_case + 100, datart$item) - -counts <- rbind(counts_item, counts_case) -barplot(counts, las = 2, border = "white", col = c("gray", "darkorange")) - -### Which teasers seem to work well? -barplot(table(datlogs$topic), las = 2) - -### Dwell times/duration -datagg <- aggregate(duration ~ event + item, datlogs, mean) -datagg$ds <- datagg$duration / 1000 - -bwplot(ds ~ event, datagg) -bwplot(ds ~ event | item, datagg) -xyplot(ds ~ event, datagg, groups = item) - -# without aggregation -bwplot(duration ~ event, datlogs) -# in min - -set.seed(1027) - -pdf("../figures/duration.pdf", width = 5, height = 5, pointsize = 10) -bwplot(I(duration/1000/60) ~ event, datlogs[sample(nrow(datlogs), 100000), ], - ylab = "Duration in min") -dev.off() - -### Move events - -datmove <- aggregate(cbind(duration, scaleSize, rotationDegree, distance, x.start, - y.start, x.stop, y.stop) ~ item, datlogs, - mean) - -hist(log(datlogs$scaleSize)) -# --> better interpretable on logscale - -plot(y.start ~ x.start, datmove, pch = 16, col = "gray") -points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datmove$scaleSize) - - -plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y", - xlim = c(0, 3840), ylim = c(0, 2160)) -with(datmove, text(x.start, y.start, item, col = "gray", cex = 1.5)) -with(datmove, - arrows(x.start, y.start, x.stop, y.stop, length = 0.07, lwd = 2) - ) -abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2) - -datscale <- aggregate(scaleSize ~ item, datlogs, max) -plot(y.start ~ x.start, datmove, pch = 16, col = "gray") -points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datscale$scaleSize) - -plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y", - xlim = c(0, 3840), ylim = c(0, 2160)) -#with(datmove, text(x.stop, y.stop, item)) -with(datmove, text(x.start, y.start, item)) - - -### Are there certain areas of the table that are touched most often? - -# heatmap -cuts <- 100 - -datlogs$x.start.cat <- cut(datlogs$x.start, cuts) -datlogs$y.start.cat <- cut(datlogs$y.start, cuts) - -tab <- xtabs( ~ x.start.cat + y.start.cat, datlogs) - -colnames(tab) <- paste0("c", 1:cuts) -rownames(tab) <- paste0("c", 1:cuts) - -heatmap(tab, Rowv = NA, Colv = NA) - - -library(ggplot2) - -ggplot(as.data.frame(tab)) + - geom_tile(aes(x = x.start.cat, y = y.start.cat, fill = Freq)) + - scale_fill_gradient(low = "gray40", high = "orange") - -dattrim <- datlogs[datlogs$x.start < 3840 & - datlogs$x.start > 0 & - datlogs$y.start < 2160 & - datlogs$y.start > 0 & - datlogs$x.stop < 3840 & - datlogs$x.stop > 0 & - datlogs$y.stop < 2160 & - datlogs$y.stop > 0, ] - -cuts <- 100 # 200, 100, 70, ... - -# start -dattrim$x.start.cat <- cut(dattrim$x.start, cuts) -dattrim$y.start.cat <- cut(dattrim$y.start, cuts) - -tab.start <- xtabs( ~ x.start.cat + y.start.cat, dattrim) -colnames(tab.start) <- NULL -rownames(tab.start) <- NULL - -pdf("../figures/heatmap_start.pdf", width = 5, height = 5, pointsize = 10) -heatmap(tab.start, Rowv = NA, Colv = NA) -dev.off() - - -my_colors <- colorRampPalette(c("#009E73", "#E69F00")) -heatmap(tab.start, Rowv = NA, Colv = NA, col = my_colors(1000)) - -ggplot(as.data.frame(tab.start)) + - geom_tile(aes(x = x.start.cat, y = y.start.cat, fill = Freq)) + - scale_fill_gradient(low = "#009E73", high = "#E69F00") - -# stop -dattrim$x.stop.cat <- cut(dattrim$x.stop, cuts) -dattrim$y.stop.cat <- cut(dattrim$y.stop, cuts) -tab.stop <- xtabs( ~ x.stop.cat + y.stop.cat, dattrim) -colnames(tab.stop) <- NULL -rownames(tab.stop) <- NULL - -pdf("../figures/heatmap_stop.pdf", width = 5, height = 5, pointsize = 10) -heatmap(tab.stop, Rowv = NA, Colv = NA) -dev.off() - -heatmap(tab.stop, Rowv = NA, Colv = NA, col = my_colors(10)) - -### How many visitors per day - -datlogs$date <- as.Date(datlogs$date.start) - -# Interactions per day -datint <- aggregate(case ~ date, datlogs, length) -plot(datint, type = "h") - -# Cases per day -datcase <- aggregate(case ~ date, datlogs, function(x) length(unique(x))) -plot(datcase, type = "h") - -# Paths per day -datpath <- aggregate(path ~ date, datlogs, function(x) length(unique(x))) -plot(datpath, type = "h") - -plot(path ~ date, datpath, type = "h", col = "#3CB4DC") -points(case ~ date, datcase, type = "h") - -pdf("../figures/cases_per_day2.pdf", width = 9, height = 5, pointsize = 10) -par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0)) -plot(case ~ date, datcase, type = "h", col = "#3CB4DC") -abline(v = datcase$date[datcase$date %in% c("2020-03-13", "2022-10-25")], - col = "#FF6900", lty = 2) -text(datcase$date[datcase$date == "2020-03-13"]+470, 80, - "Corona gap from 2020-03-13 to 2022-10-25", - col = "#D55E00") -dev.off() - - -### Other stuff - -## function dependencies of mtt -devtools::load_all("../../../../software/mtt") -#library(mtt) - -library(mvbutils) -foodweb(where = "package:mtt") - -pdf("../figures/fun_depend_mtt.pdf", width = 8, height = 4, pointsize = 10) -foodweb(where = "package:mtt", - prune = c("parse_logfiles", "create_eventlogs", "extract_artworks", - "extract_topics"), - #expand.ybox = 1.8, #cex = .6, - #border = TRUE, - #boxcolor = "gray", - color.lines = FALSE, - lwd = 2, mai = c(0, 0, 0, 0)) -dev.off() - - -## weird behavior of timeMs - -pdf("../figures/timeMs.pdf", width = 9, height = 6, pointsize = 10) -#par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0)) -#plot(timeMs.start ~ as.factor(fileId), datlogs[1:2000,], xlab = "fileId") -bwplot(timeMs.start ~ as.factor(fileId.start), datlogs[1:2000,], xlab = "", - scales = list(x = list(rot = 90), y = list(rot = 90))) -dev.off() - -## x,y-coordinates out of range - -set.seed(1522) - -pdf("../figures/xycoord.pdf", width = 5, height = 5, pointsize = 10) -par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0)) -#par(mfrow = c(1, 2)) -plot(y.start ~ x.start, datlogs[sample(nrow(datlogs), 10000), ]) -abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2) -#plot(y.stop ~ x.stop, datlogs) -#abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2) -legend("bottomleft", "Random sample of 10,000", bg = "white") -legend("topleft", "4K-Display: 3840 x 2160", bg = "white") -dev.off() - -## moves - -dat001 <- datlogs[which(datlogs$item == "001"), ] - -index <- as.numeric(as.factor(dat001$path)) -cc <- sample(colors(), 100) - -plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y", - xlim = c(0, 3840), ylim = c(0, 2160)) -with(dat001[1:200,], arrows(x.start, y.start, x.stop, y.stop, - length = .07, col = cc[index])) - -plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y", - xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray") -points(y.start ~ x.start, dat001, xlab = "x", ylab = "y", - xlim = c(0, 3840), ylim = c(0, 2160), cex = dat001$scaleSize, - col = "blue") - - - -cc <- sample(colors(), 70) - -dat1 <- datlogs[!duplicated(datlogs$item), ] -dat1 <- dat1[order(dat1$item), ] - -plot(y.start ~ x.start, dat1, type = "n", xlim = c(-100, 4500), ylim = c(-100, 2500)) -abline(h = c(0, 2160), v = c(0, 3840), col = "lightgray") -with(dat1, points(x.start, y.start, col = cc, pch = 16)) -with(dat1, points(x.stop, y.stop, col = cc, pch = 16)) -with(dat1, arrows(x.start, y.start, x.stop, y.stop, length = .07, col = cc)) -