mtt_haum/code/02_descriptives.R

760 lines
25 KiB
R
Raw Normal View History

2024-02-13 09:03:35 +01:00
# 02_descriptives.R
2023-11-01 18:46:39 +01:00
#
# content: (1) Read data
2023-11-01 18:46:39 +01:00
# (2) Descriptives
# (3) Process Mining
# (3.1) Check data quality
# (3.2) Interactions for different artworks
# (3.3) Patterns of cases
# (3.4) Artwork sequences
# (3.5) Topics
2023-11-01 18:46:39 +01:00
#
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
# results/haum/raw_logfiles_2024-02-21_16-07-33.csv
# output:
2023-11-01 18:46:39 +01:00
#
2024-03-13 18:14:57 +01:00
# last mod: 2024-03-13
2023-11-01 18:46:39 +01:00
2024-03-13 18:14:57 +01:00
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
2023-11-01 18:46:39 +01:00
library(lattice)
library(bupaverse)
#library(mtt)
devtools::load_all("../../../../../software/mtt")
2023-11-01 18:46:39 +01:00
# Overall Research Question: How do museum visitors interact with the
# artworks presented on the MTT?
# Distribution of bursts
# Can this be visualized in a nice way?
#--------------- (1) Read data ---------------
datlogs <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv",
2024-01-31 12:09:27 +01:00
colClasses = c("character", "character", "POSIXct",
"POSIXct", "character", "integer",
"numeric", "character", "character",
rep("numeric", 3), "character",
"character", rep("numeric", 11),
"character", "character"),
sep = ";", header = TRUE)
datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard",
"openTopic",
"openPopup"))
2024-03-13 18:14:57 +01:00
datraw <- read.table("results/haum/raw_logfiles_2024-02-21_16-07-33.csv",
2024-01-31 12:09:27 +01:00
sep = ";", header = TRUE)
2023-11-01 18:46:39 +01:00
# Add weekdays to data frame
2024-01-31 12:09:27 +01:00
datlogs$weekdays <- factor(weekdays(datlogs$date.start),
2023-11-01 18:46:39 +01:00
levels = c("Montag", "Dienstag", "Mittwoch",
"Donnerstag", "Freitag", "Samstag",
"Sonntag"),
labels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday",
"Sunday"))
2024-03-13 18:14:57 +01:00
2024-01-31 12:09:27 +01:00
### Number of log files
length(unique(datraw$fileId))
# 39767
length(unique(c(datlogs$fileId.start, datlogs$fileId.stop)))
# 22789
### Number of activities
nrow(datlogs)
table(datlogs$event)
proportions(table(datlogs$event))
proportions(table(datlogs$event[datlogs$event != "move"]))
### Time range
range(as.Date(datlogs$date.start))
### Topics per item
print(xtabs( ~ item + topic, datlogs), zero = "-")
lattice::dotplot(xtabs( ~ item + topic, datlogs), auto.key = TRUE)
mat <- t(as.matrix(xtabs( ~ item + topic, datlogs)))
mat[mat == 0] <- NA
image(mat, axes = F, col = rainbow(100))
#--------------- (2) Descriptives ---------------
2024-01-31 12:09:27 +01:00
### Which item gets touched most often?
2023-11-01 18:46:39 +01:00
counts_item <- table(datlogs$item)
lattice::barchart(counts_item)
items <- unique(datlogs$item)
#items <- items[!items %in% c("504", "505")]
datart <- extract_artworks(items,
paste0(items, ".xml"),
"../data/haum/ContentEyevisit/eyevisit_cards_light/")
datart <- datart[order(datart$artwork), ]
names(counts_item) <- datart$title
pdf("results/figures/counts_item.pdf", width = 20, height = 10, pointsize = 10)
par(mai = c(5, .6, .1, .1))
tmp <- barplot(counts_item, las = 2, ylim = c(0, 60000),
border = NA, col = "#434F4F")
text(tmp, counts_item + 1000, datart$artwork)
dev.off()
### Which item gets touched most often first?
datcase <- datlogs[!duplicated(datlogs$case), ]
counts_case <- table(datcase$item)
names(counts_case) <- datart$title
tmp <- barplot(counts_case, las = 2, border = "white")
text(tmp, counts_case + 100, datart$item)
counts <- rbind(counts_item, counts_case)
pdf("results/figures/counts_item_firsttouch.pdf",
width = 20, height = 10, pointsize = 10)
par(mai = c(5, .6, .1, .1))
tmp <- barplot(counts, las = 2, border = NA, col = c("#434F4F", "#FF6900"), ylim = c(0, 65000))
text(tmp, counts_item + counts_case + 1000, datart$artwork)
legend("topleft", c("Total interactions", "First interactions"),
col = c("#434F4F", "#FF6900"), pch = 15, bty = "n")
dev.off()
### Which teasers seem to work well?
barplot(table(datlogs$topic), las = 2)
### Dwell times/duration
datagg <- aggregate(duration ~ event + item, datlogs, mean)
datagg$ds <- datagg$duration / 1000 # in secs
lattice::bwplot(ds ~ event, datagg)
# without aggregation
lattice::bwplot(duration / 1000 / 60 ~ event, datlogs)
# in min
set.seed(1027)
pdf("results/figures/duration.pdf", width = 5, height = 5, pointsize = 10)
lattice::bwplot(I(duration/1000/60) ~ event, datlogs[sample(nrow(datlogs), 100000), ],
ylab = "Duration in min")
dev.off()
### Move events
datmove <- aggregate(cbind(duration, scaleSize, rotationDegree, distance, x.start,
y.start, x.stop, y.stop) ~ item, datlogs,
mean)
hist(log(datlogs$scaleSize))
# --> better interpretable on logscale
plot(y.start ~ x.start, datmove, pch = 16, col = "gray")
points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datmove$scaleSize)
plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160))
with(datmove, text(x.start, y.start, item, col = "gray", cex = 1.5))
with(datmove,
arrows(x.start, y.start, x.stop, y.stop, length = 0.07, lwd = 2)
)
abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2)
datscale <- aggregate(scaleSize ~ item, datlogs, max)
plot(y.start ~ x.start, datmove, pch = 16, col = "gray")
points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datscale$scaleSize)
plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160))
#with(datmove, text(x.stop, y.stop, item))
with(datmove, text(x.start, y.start, item))
### Are there certain areas of the table that are touched most often?
# heatmap
cuts <- 100
datlogs$x.start.cat <- cut(datlogs$x.start, cuts)
datlogs$y.start.cat <- cut(datlogs$y.start, cuts)
tab <- xtabs( ~ x.start.cat + y.start.cat, datlogs)
colnames(tab) <- paste0("c", 1:cuts)
rownames(tab) <- paste0("c", 1:cuts)
heatmap(tab, Rowv = NA, Colv = NA)
dattrim <- datlogs[datlogs$x.start < 3840 &
datlogs$x.start > 0 &
datlogs$y.start < 2160 &
datlogs$y.start > 0 &
datlogs$x.stop < 3840 &
datlogs$x.stop > 0 &
datlogs$y.stop < 2160 &
datlogs$y.stop > 0, ]
cuts <- 100 # 200, 100, 70, ...
# start
dattrim$x.start.cat <- cut(dattrim$x.start, cuts)
dattrim$y.start.cat <- cut(dattrim$y.start, cuts)
tab.start <- xtabs( ~ x.start.cat + y.start.cat, dattrim)
colnames(tab.start) <- NULL
rownames(tab.start) <- NULL
pdf("results/figures/heatmap_start.pdf", width = 5, height = 5, pointsize = 10)
heatmap(tab.start, Rowv = NA, Colv = NA)
dev.off()
# stop
dattrim$x.stop.cat <- cut(dattrim$x.stop, cuts)
dattrim$y.stop.cat <- cut(dattrim$y.stop, cuts)
tab.stop <- xtabs( ~ x.stop.cat + y.stop.cat, dattrim)
colnames(tab.stop) <- NULL
rownames(tab.stop) <- NULL
pdf("results/figures/heatmap_stop.pdf", width = 5, height = 5, pointsize = 10)
heatmap(tab.stop, Rowv = NA, Colv = NA)
dev.off()
### How many visitors per day
datlogs$date <- as.Date(datlogs$date.start)
# Interactions per day
datint <- aggregate(case ~ date, datlogs, length)
plot(datint, type = "h")
# Cases per day
datcase <- aggregate(case ~ date, datlogs, function(x) length(unique(x)))
plot(datcase, type = "h")
# Paths per day
datpath <- aggregate(path ~ date, datlogs, function(x) length(unique(x)))
plot(datpath, type = "h")
plot(path ~ date, datpath, type = "h", col = "#3CB4DC")
points(case ~ date, datcase, type = "h")
pdf("results/figures/cases_per_day.pdf", width = 9, height = 5, pointsize = 10)
par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
plot(case ~ date, datcase, type = "h", col = "#434F4F")
abline(v = datcase$date[datcase$date %in% c("2020-03-13", "2022-10-25")],
col = "#FF6900", lty = 2)
text(datcase$date[datcase$date == "2020-03-13"]+470, 80,
"Corona gap from 2020-03-13 to 2022-10-25",
col = "#FF6900")
dev.off()
### Other stuff
library(mvbutils)
foodweb(where = "package:mtt")
pdf("results/figures/fun_depend_mtt.pdf", width = 8, height = 4, pointsize = 10)
foodweb(where = "package:mtt",
prune = c("parse_logfiles", "create_eventlogs", "extract_artworks",
"extract_topics"),
#expand.ybox = 1.8, #cex = .6,
#border = TRUE,
#boxcolor = "gray",
color.lines = FALSE,
lwd = 2, mai = c(0, 0, 0, 0))
dev.off()
## weird behavior of timeMs
pdf("results/figures/timeMs.pdf", width = 9, height = 6, pointsize = 10)
#par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
#plot(timeMs.start ~ as.factor(fileId), datlogs[1:2000,], xlab = "fileId")
lattice::bwplot(timeMs.start ~ as.factor(fileId.start), datlogs[1:2000,], xlab = "",
scales = list(x = list(rot = 90), y = list(rot = 90)))
dev.off()
## x,y-coordinates out of range
set.seed(1522)
pdf("results/figures/xycoord.pdf", width = 5, height = 5, pointsize = 10)
par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
#par(mfrow = c(1, 2))
plot(y.start ~ x.start, datlogs[sample(nrow(datlogs), 10000), ])
abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2)
#plot(y.stop ~ x.stop, datlogs)
#abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2)
legend("bottomleft", "Random sample of 10,000", bg = "white")
legend("topleft", "4K-Display: 3840 x 2160", bg = "white")
dev.off()
## moves
dat001 <- datlogs[which(datlogs$item == "001"), ]
index <- as.numeric(as.factor(dat001$path))
cc <- sample(colors(), 100)
plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160))
with(dat001[1:200,], arrows(x.start, y.start, x.stop, y.stop,
length = .07, col = cc[index]))
plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray")
points(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160), cex = dat001$scaleSize,
col = "blue")
cc <- sample(colors(), 70)
dat1 <- datlogs[!duplicated(datlogs$item), ]
dat1 <- dat1[order(dat1$item), ]
plot(y.start ~ x.start, dat1, type = "n", xlim = c(-100, 4500), ylim = c(-100, 2500))
abline(h = c(0, 2160), v = c(0, 3840), col = "lightgray")
with(dat1, points(x.start, y.start, col = cc, pch = 16))
with(dat1, points(x.stop, y.stop, col = cc, pch = 16))
with(dat1, arrows(x.start, y.start, x.stop, y.stop, length = .07, col = cc))
2023-11-01 18:46:39 +01:00
2023-11-15 15:54:23 +01:00
2024-01-31 12:09:27 +01:00
# How many events per topic, per path, ...
2023-11-01 18:46:39 +01:00
# How many popups per artwork?
# Number of events per artwork
tab <- xtabs( ~ item + event, datlogs)
2023-11-01 18:46:39 +01:00
addmargins(tab)
proportions(tab, margin = "item")
2023-11-01 18:46:39 +01:00
proportions(tab, margin = "event")
cc <- palette.colors(palette = "Okabe-Ito")[c(3,2,4,8)]
pdf("results/figures/event-dist.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
barplot(t(proportions(tab, margin = "item")), las = 2, col = c("#78004B", "#3CB4DC", "#91C86E", "#FF6900"),
2024-01-31 12:09:27 +01:00
legend.text = levels(datlogs$event), args.legend = list(x = "bottomleft", bg = "white"))
dev.off()
#barchart(proportions(tab, margin = "item"), las = 2)
2023-11-01 18:46:39 +01:00
# Proportion of events
2024-01-31 12:09:27 +01:00
proportions(xtabs( ~ event, datlogs))
# Mean proportion of event per path
colMeans(proportions(xtabs( ~ path + event, datlogs), margin = "path"))
# Mean proportion of event per item
colMeans(proportions(tab, margin = "item"))
2023-11-01 18:46:39 +01:00
# Proportion of unclosed events
2024-01-31 12:09:27 +01:00
nrow(datlogs[is.na(datlogs$complete), ])
nrow(datlogs[is.na(datlogs$complete), ]) / nrow(datlogs)
2023-11-01 18:46:39 +01:00
# Proportion of events spanning more than one log file
2024-01-31 12:09:27 +01:00
sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE)
sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE) / nrow(datlogs)
2023-11-01 18:46:39 +01:00
#--------------- (3) Process Mining ---------------
#--------------- (3.1) Check data quality ---------------
datlogs$start <- datlogs$date.start
datlogs$complete <- datlogs$date.stop
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs,
case_id = "path",
2023-11-01 18:46:39 +01:00
activity_id = "event",
#resource_id = "case",
resource_id = "item",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
alogf <- edeaR::filter_trace_frequency(alog, percentage = 0.9)
processmapR::process_map(alogf, # alog,
type_nodes = processmapR::frequency("absolute"),
sec_nodes = processmapR::frequency("relative"),
type_edges = processmapR::frequency("absolute"),
sec_edges = processmapR::frequency("relative"),
rankdir = "TB")
alog_no_move <- alog[alog$event != "move", ]
2023-11-15 15:54:23 +01:00
pdf("../figures/traceexplore_trace-event.pdf", height = 8, width = 12, pointsize = 10)
set.seed(1447)
2024-01-31 12:09:27 +01:00
processmapR::trace_explorer(alog_no_move[alog_no_move$path %in%
sample(unique(alog_no_move$path), 400),],
coverage = 1, type = "frequent",
abbreviate = T)
2023-11-15 15:54:23 +01:00
dev.off()
pdf("../figures/ra_trace-event.pdf", height = 8, width = 12, pointsize = 10)
ra_no_move <- edeaR::resource_frequency(alog_no_move, "resource-activity")
levels(ra_no_move$event) <- c("flipCard", "flipCard", "openTopic", "openPopup")
plot(ra_no_move)
2023-11-15 15:54:23 +01:00
dev.off()
ra <- edeaR::resource_frequency(alog, "resource-activity")
plot(ra)
heatmap(xtabs(relative_activity ~ artwork + event, ra))
heatmap(xtabs(relative_resource ~ artwork + event, ra_no_move))
heatmap(xtabs(relative_activity ~ artwork + event, ra_no_move))
aggregate(relative_activity ~ event, ra, sum)
aggregate(relative_resource ~ artwork, ra, sum)
2023-11-01 18:46:39 +01:00
#--------------- (3.2) Interactions for different artworks ---------------
# Do interaction patterns for events per trace look different for different
# artworks?
2024-01-31 12:09:27 +01:00
which.max(table(datlogs$artwork))
which.min(table(datlogs$artwork))
which.min(table(datlogs$artwork)[-c(71,72)])
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
case_id = "path",
2023-11-01 18:46:39 +01:00
activity_id = "event",
resource_id = "artwork",
timestamps = c("start", "complete"))
process_map(alog80, frequency("relative"))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog087 <- activitylog(datlogs[datlogs$artwork == "087",],
case_id = "path",
2023-11-01 18:46:39 +01:00
activity_id = "event",
resource_id = "artwork",
timestamps = c("start", "complete"))
process_map(alog087, frequency("relative"))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog504 <- activitylog(datlogs[datlogs$artwork == "504",],
case_id = "path",
2023-11-01 18:46:39 +01:00
activity_id = "event",
resource_id = "artwork",
timestamps = c("start", "complete"))
process_map(alog504, frequency("relative"))
2023-11-01 18:46:39 +01:00
#--------------- (3.3) Patterns of cases ---------------
# What kind of patterns do we have? Are their typical sequences for cases?
# Do case patterns look different for ...
# ... mornings and afternoons?
# ... weekdays and weekends?
# ... weekdays for "normal" and school vacation days?
# ... pre and post corona?
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs,
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
2023-11-15 15:54:23 +01:00
alog_no_move <- alog[alog$event != "move", ]
pdf("../figures/traceexplore_case-event.pdf", height = 8, width = 12, pointsize = 10)
set.seed(1050)
2024-01-31 12:09:27 +01:00
processmapR::trace_explorer(alog_no_move[alog_no_move$path %in%
sample(unique(alog_no_move$path), 300),],
2023-11-15 15:54:23 +01:00
coverage = 1, type = "frequent",
abbreviate = T)
dev.off()
2023-11-01 18:46:39 +01:00
process_map(alog080, frequency("relative"))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog087 <- activitylog(datlogs[datlogs$artwork == "087",],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog087, frequency("relative"))
2023-11-01 18:46:39 +01:00
### Mornings and afternoons
2024-01-31 12:09:27 +01:00
datlogs$tod <- ifelse(lubridate::hour(datlogs$start) > 13, "afternoon", "morning")
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[datlogs$tod == "morning",],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[datlogs$tod == "afternoon",],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
# Are the same artworks looked at?
pdf("../figures/bp_tod.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
2024-01-31 12:09:27 +01:00
barplot(proportions(xtabs( ~ tod + artwork, datlogs), margin = "tod"), #col = cc[1:2],
las = 2, beside = TRUE, legend = c("afternoon", "morning"),
args.legend = list(x = "topleft"))
dev.off()
2023-11-01 18:46:39 +01:00
### Weekdays and weekends
2024-01-31 12:09:27 +01:00
datlogs$wd <- ifelse(datlogs$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday")
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[datlogs$wd == "weekend",],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[datlogs$wd == "weekday",],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
# Are the same artworks looked at?
pdf("../figures/bp_wd.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
2024-01-31 12:09:27 +01:00
barplot(proportions(xtabs( ~ wd + artwork, datlogs), margin = "wd"),
las = 2, beside = TRUE, legend = c("weekday", "weekend"),
args.legend = list(x = "topleft"))
dev.off()
2023-11-01 18:46:39 +01:00
### Weekdays vs. school vacation weekdays
2024-01-31 12:09:27 +01:00
datlogs$wds <- ifelse(!is.na(datlogs$vacation), "vacation", "school")
datlogs$wds[datlogs$wd == "weekend"] <- NA
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[which(datlogs$wds == "school"),],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[which(datlogs$wds == "vacation"),],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
# Are the same artworks looked at?
pdf("../figures/bp_wds.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
2024-01-31 12:09:27 +01:00
#barplot(xtabs( ~ wds + artwork, datlogs), las = 2, beside = TRUE,
barplot(proportions(xtabs( ~ wds + artwork, datlogs), margin = "wds"),
las = 2, beside = TRUE,
legend = c("school", "vacation"), args.legend = list(x = "topleft"))
dev.off()
2023-11-01 18:46:39 +01:00
### Pre and post Corona
2024-01-31 12:09:27 +01:00
datlogs$corona <- ifelse(datlogs$date < "2020-03-14", "pre", "post")
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[which(datlogs$corona == "pre"),],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[which(datlogs$corona == "post"),],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "event",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
# Are the same artworks looked at?
pdf("../figures/bp_corona.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
2024-01-31 12:09:27 +01:00
barplot(proportions(xtabs( ~ corona + artwork, datlogs), margin = "corona"),
las = 2, beside = TRUE,
legend = c("post", "pre"), args.legend = list(x = "topleft"))
dev.off()
2023-11-01 18:46:39 +01:00
#--------------- (3.4) Artwork sequences ---------------
# Order in which artworks are looked at
nart <- 5 # select 5 artworks randomly
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs,#[datlogs$artwork %in% sample(unique(datlogs$artwork), nart), ],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "artwork",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
#map <- process_map(alog, frequency("relative"))
## select cases with Vermeer
2024-01-31 12:09:27 +01:00
length(unique(datlogs[datlogs$artwork == "080", "case"]))
2023-11-01 18:46:39 +01:00
# 12615
2024-01-31 12:09:27 +01:00
case080 <- unique(datlogs[datlogs$artwork == "080", "case"])
tmp <- datlogs[datlogs$case %in% case080, ]
2023-11-01 18:46:39 +01:00
table(tmp$artwork)
# --> all :)
# select the ones most often (I am aiming for 10...)
barplot(table(tmp$artwork))
abline(h = 14000, col = "red")
which(table(tmp$artwork) > 14000)
2023-11-01 18:46:39 +01:00
often080 <- names(which(table(tmp$artwork) > 14000))
2023-11-01 18:46:39 +01:00
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[datlogs$artwork %in% often080, ],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "artwork",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
2023-11-15 15:54:23 +01:00
pdf("../figures/traceexplore_case-artwork_often080.pdf", height = 8, width = 12, pointsize = 10)
processmapR::trace_explorer(alog,
n_traces = 30, type = "frequent",
abbreviate = TRUE)
dev.off()
2023-11-01 18:46:39 +01:00
#--------------- (3.5) Topics ---------------
# Are there certain topics that people are interested in more than others?
# Do these topic distributions differ for comparable artworks?
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[which(datlogs$event == "openTopic"),],
case_id = "case",
activity_id = "topic",
2024-01-31 12:09:27 +01:00
resource_id = "path",
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
# Order of topics for Vermeer
2024-01-31 12:09:27 +01:00
# alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
2023-11-01 18:46:39 +01:00
# case_id = "case",
# activity_id = "topic",
2024-01-31 12:09:27 +01:00
# resource_id = "path",
2023-11-01 18:46:39 +01:00
# timestamps = c("start", "complete"))
#
2024-01-31 12:09:27 +01:00
# alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
2023-11-01 18:46:39 +01:00
# case_id = "case",
# activity_id = "topicFile",
2024-01-31 12:09:27 +01:00
# resource_id = "path",
2023-11-01 18:46:39 +01:00
# timestamps = c("start", "complete"))
#
2023-11-01 18:46:39 +01:00
# #process_map(alog080, frequency("relative"))
#
2023-11-01 18:46:39 +01:00
# # Comparable artwork
2024-01-31 12:09:27 +01:00
# alog083 <- activitylog(datlogs[datlogs$artwork == "083",],
2023-11-01 18:46:39 +01:00
# case_id = "case",
# activity_id = "topic",
2024-01-31 12:09:27 +01:00
# resource_id = "path",
2023-11-01 18:46:39 +01:00
# timestamps = c("start", "complete"))
# artworks that have the same topics than Vermeer
2024-01-31 12:09:27 +01:00
which(rowSums(xtabs( ~ artwork + topic, datlogs[datlogs$topic %in%
2023-11-01 18:46:39 +01:00
c("artist", "details", "extra info", "komposition",
"licht und farbe", "thema"), ]) != 0) == 6)
#037 046 062 080 083 109
2023-11-01 18:46:39 +01:00
for (art in c("037", "046", "062", "080", "083", "109")) {
2024-01-31 12:09:27 +01:00
alog <- activitylog(datlogs[datlogs$event == "openTopic" & datlogs$artwork == art,],
2023-11-01 18:46:39 +01:00
case_id = "case",
activity_id = "topic",
2024-01-31 12:09:27 +01:00
resource_id = "path",
2023-11-01 18:46:39 +01:00
timestamps = c("start", "complete"))
process_map(alog, frequency("relative"))
2023-11-01 18:46:39 +01:00
}
# Angewandte Kunst, Graphik, Gemälde, Kultur
c("Kultur", "Kultur", "Graphik", "Gemälde", "Gemälde", "Gemälde",
"Gemälde", "Gemälde", "Graphik", "Gemälde", "Angewandte Kunst", "",
"Gemälde", "Angewandte Kunst", "", "", "Graphik", "Angewandte Kunst",
"Angewandte Kunst", "Gemälde", "Angewandte Kunst", "Gemälde", "",
"Gemälde", "Gemälde", "Gemälde", "Graphik", "Gemälde", "Gemälde",
"Gemälde", "", "Angewandte Kunst", "Angewandte Kunst", "Gemälde",
"Graphik", "Gemälde", "Gemälde", "Gemälde", "Gemälde",
"Angewandte Kunst", "Gemälde", "Gemälde", "Gemälde", "Kultur", "Kultur",
"Gemälde", "Kultur", "", "Gemälde", "", "Graphik", "Kultur", "Gemälde",
"", "Kultur", "Gemälde", "Kultur", "Gemälde", "Gemälde", "Gemälde",
"Kultur", "Kultur", "Kultur", "Kultur", "Kultur", "Kultur",
"Angewandte Kunst", "Info", "Info", "Info", "Kultur", "Kultur")
# BURSTS
2024-01-31 12:09:27 +01:00
which.max(table(datlogs$date))
tmp <- datlogs[datlogs$date == "2017-02-12", ]
2023-11-01 18:46:39 +01:00
# number of traces per case on 2017-02-12
2024-01-31 12:09:27 +01:00
rowSums(xtabs( ~ case + path, tmp) != 0)
2023-11-01 18:46:39 +01:00
range(tmp$start)
hours <- lubridate::hour(tmp$start)
xtabs( ~ case + hours, tmp)
2023-11-01 18:46:39 +01:00
# distribution of cases over the day
colSums(xtabs( ~ case + hours, tmp) != 0)
barplot(colSums(xtabs( ~ case + hours, tmp) != 0))
2024-01-31 12:09:27 +01:00
aggregate(path ~ case + hours, tmp, length)
2024-01-31 12:09:27 +01:00
tmp <- aggregate(path ~ case, datlogs, length)
tmp$date <- as.Date(datlogs[!duplicated(datlogs$case), "start"])
tmp$time <- lubridate::hour(datlogs[!duplicated(datlogs$case), "start"])
2024-01-31 12:09:27 +01:00
tmp[tmp$path > 200, ]
2024-01-31 12:09:27 +01:00
plot(path ~ time, tmp, cex = 2, col = rgb(0,0,0,.3))
2024-01-31 12:09:27 +01:00
lattice::barchart(path ~ time, tmp, horizontal=F)
2023-11-01 18:46:39 +01:00