Tried out current state of mtt; parsing for 8o8m still not working correctly

This commit is contained in:
Nora Wickelmaier 2023-11-01 18:48:14 +01:00
parent d4e9676dd3
commit fb448f01a9
3 changed files with 47 additions and 49 deletions

View File

@ -10,17 +10,26 @@ folders <- dir("../data/8o8m/LogFiles/")
# parse raw log files
datraw <- parse_logfiles(folders, path = "../data/8o8m/LogFiles/")
artworks <- unique(na.omit(datraw$artwork))
#artworks <- unique(na.omit(datraw$artwork))
# export data
write.table(datraw, paste0("../data/8o8m/raw_logfiles_", now, ".csv"),
sep = ";", row.names = FALSE)
datraw2 <- datraw[!is.na(datraw$artwork), ]
#datraw[is.na(datraw$artwork), ]
datraw <- datraw[!is.na(datraw$artwork), ]
# TODO: Why is this happening?
# convert to log events
datlogs <- create_eventlogs(datraw2, xmlpath = "../data/8o8m/Content8o8m/")
datlogs <- create_eventlogs(datraw, xmlpath = "../data/8o8m/Content8o8m/")
artworks <- unique(datlogs$artwork)
topics <- extract_topics(artworks, xmlfiles = paste0(artworks, "_en.xml"),
xmlpath = "../data/8o8m/Content8o8m/")
# TODO: What is wrong with the German XML files that the topics are
# extracted like this? (It works fine for the English versions...)
# export data
write.table(datlogs, paste0("../data/8o8m/event_logfiles_", now, ".csv"),

View File

@ -24,20 +24,25 @@ now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
#--------------- (1) Parse raw log files ---------------
path <- "../data/haum/LogFiles/"
folders <- dir(path)
#path <- "../data/haum/LogFiles/"
#folders <- dir(path)
#folders <- "2016"
datraw <- parse_logfiles(folders, path)
#datraw <- parse_logfiles(folders, path)
datraw <- read.table("../data/haum/raw_logfiles_2023-10-25_16-20-45.csv",
sep = ";", header = TRUE)
## Export data
write.table(datraw, paste0("../data/haum/raw_logfiles_", now, ".csv"),
sep = ";", row.names = FALSE)
#write.table(datraw, paste0("../data/haum/raw_logfiles_small_", now, ".csv"),
# sep = ";", row.names = FALSE)
#--------------- (2) Create event logs ---------------
datlogs <- create_eventlogs(datraw,
xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")
xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
glossar = TRUE)
artworks <- unique(na.omit(datlogs$artwork))
topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"),
@ -95,12 +100,14 @@ for (i in seq_len(nrow(sf))) {
## Merge data
dat1 <- merge(datlogs_topics, hd, by.x = "date.start", by.y = "date", all.x = TRUE)
dat2 <- merge(dat1, sfdat, by.x = "date.start", by.y = "date", all.x = TRUE)
datlogs_topics$date <- as.Date(datlogs_topics$date.start)
dat1 <- merge(datlogs_topics, hd, by.x = "date", by.y = "date", all.x = TRUE)
dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE)
## Export data
write.table(dat2, paste0("../data/haum/event_logfiles_", now, ".csv"),
write.table(dat2, paste0("../data/haum/event_logfiles_glossar_", now, ".csv"),
sep = ";", row.names = FALSE)
# TODO: Maybe add infos about artworks?

View File

@ -2,22 +2,14 @@
# Read data
# dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv",
# sep = ";", header = TRUE)
dat0 <- read.table("../data/haum/event_logfiles_small_metadata_2023-10-19_18-25-26.csv",
dat0 <- read.table("../data/haum/event_logfiles_2023-10-25_17-29-52.csv",
sep = ";", header = TRUE)
dat0$date <- as.Date(dat0$date)
dat0$date.start <- as.POSIXct(dat0$date.start)
dat0$date.stop <- as.POSIXct(dat0$date.stop)
dat0$artwork <- sprintf("%03d", dat0$artwork)
# TODO: Write a function that closes events spanning different log files
# OR: Remove openTopic and OpenPopup events that do not start with a
# flipCard (AND openPopup events without openTopic event beforehand)
table(dat0[!duplicated(dat0$trace), "event"])
# flipCard move openPopup openTopic
# 45528 247718 981 3457
proportions(table(dat0[!duplicated(dat0$trace), "event"]))
tmp <- dat0[!duplicated(dat0$trace) & dat0$event %in% c("openTopic",
@ -48,38 +40,28 @@ table(table(dat$start))
summary(aggregate(duration ~ trace, dat, mean))
# remove fragmented traces
tab <- xtabs( ~ trace + event, dat)
fragments <- NULL
for (i in seq_len(nrow(tab))) {
if (tab[i, "openPopup"] != 0 & tab[i, "flipCard"] == 0) {
fragments <- c(fragments, rownames(tab)[i])
} else if (tab[i, "openTopic"] != 0 & tab[i, "flipCard"] == 0) {
fragments <- c(fragments, rownames(tab)[i])
} else if (tab[i, "openPopup"] != 0 & tab[i, "openTopic"] == 0) {
fragments <- c(fragments, rownames(tab)[i])
}
}
datrm <- dat[!dat$trace %in% fragments, ]
# TODO: Find trace that has flipCard --> openPopup --> openTopic
dato <- datrm[datrm$event != "move", ]
tmp <- lapply(unique(dato$trace), function(x) unique(dato[dato$trace == x, "event"]))
names(tmp) <- unique(dato$trace)
dato <- dat[dat$event != "move", ]
dato_split <- split(dato, ~ trace)
tmp <- lapply(dato_split, function(x) unique(x$event))
#tmp <- lapply(unique(dato$trace), function(x) unique(dato[dato$trace == x, "event"]))
ids <- sapply(tmp, length) == 3
do.call(rbind, tmp[ids])
tmp2 <- as.data.frame(do.call(rbind, tmp[ids]))
names(tmp2) <- c("flipCard", "openTopic", "openPopup")
# TODO:
# fragmentary traces: for 4591 openTopic for topic 1 is in the raw log files, but gets
# probably removed in close_events(); how can I prevent that? How can I fix
# the traces and eventIds that do not match correctly ???
ct <- c(4591, 5937, 7080, 8412, 8279)
datrm[datrm$trace %in% ct, 1:10]
table(tmp2$flipCard)
table(tmp2$openTopic)
table(tmp2$openPopup)
frag_ids <- which(tmp2$openTopic == "openPopup")
tmp3 <- dat[dat$trace %in% rownames(tmp2)[frag_ids], ]
tmp4 <- tmp3[!tmp3$glossar == 1, ]
dat6 <- rbind(dat[!dat$trace %in% rownames(tmp2)[frag_ids], ], tmp4)
### WHY?????
alog <- activitylog(dat,
case_id = "trace",