From fb448f01a9162e21698ea11a1050d1226731e919 Mon Sep 17 00:00:00 2001 From: nwickel Date: Wed, 1 Nov 2023 18:48:14 +0100 Subject: [PATCH] Tried out current state of mtt; parsing for 8o8m still not working correctly --- code/01_preprocessing_8o8m.R | 15 ++++++++-- code/01_preprocessing_haum.R | 25 ++++++++++------ code/04_modeling_haum.R | 56 ++++++++++++------------------------ 3 files changed, 47 insertions(+), 49 deletions(-) diff --git a/code/01_preprocessing_8o8m.R b/code/01_preprocessing_8o8m.R index 5b2509c..e9690b9 100644 --- a/code/01_preprocessing_8o8m.R +++ b/code/01_preprocessing_8o8m.R @@ -10,17 +10,26 @@ folders <- dir("../data/8o8m/LogFiles/") # parse raw log files datraw <- parse_logfiles(folders, path = "../data/8o8m/LogFiles/") -artworks <- unique(na.omit(datraw$artwork)) +#artworks <- unique(na.omit(datraw$artwork)) # export data write.table(datraw, paste0("../data/8o8m/raw_logfiles_", now, ".csv"), sep = ";", row.names = FALSE) -datraw2 <- datraw[!is.na(datraw$artwork), ] +#datraw[is.na(datraw$artwork), ] +datraw <- datraw[!is.na(datraw$artwork), ] # TODO: Why is this happening? # convert to log events -datlogs <- create_eventlogs(datraw2, xmlpath = "../data/8o8m/Content8o8m/") +datlogs <- create_eventlogs(datraw, xmlpath = "../data/8o8m/Content8o8m/") + +artworks <- unique(datlogs$artwork) + + +topics <- extract_topics(artworks, xmlfiles = paste0(artworks, "_en.xml"), + xmlpath = "../data/8o8m/Content8o8m/") +# TODO: What is wrong with the German XML files that the topics are +# extracted like this? (It works fine for the English versions...) # export data write.table(datlogs, paste0("../data/8o8m/event_logfiles_", now, ".csv"), diff --git a/code/01_preprocessing_haum.R b/code/01_preprocessing_haum.R index 8c31476..ea8e5f3 100644 --- a/code/01_preprocessing_haum.R +++ b/code/01_preprocessing_haum.R @@ -24,20 +24,25 @@ now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S") #--------------- (1) Parse raw log files --------------- -path <- "../data/haum/LogFiles/" -folders <- dir(path) +#path <- "../data/haum/LogFiles/" +#folders <- dir(path) +#folders <- "2016" -datraw <- parse_logfiles(folders, path) +#datraw <- parse_logfiles(folders, path) + +datraw <- read.table("../data/haum/raw_logfiles_2023-10-25_16-20-45.csv", + sep = ";", header = TRUE) ## Export data -write.table(datraw, paste0("../data/haum/raw_logfiles_", now, ".csv"), - sep = ";", row.names = FALSE) +#write.table(datraw, paste0("../data/haum/raw_logfiles_small_", now, ".csv"), +# sep = ";", row.names = FALSE) #--------------- (2) Create event logs --------------- datlogs <- create_eventlogs(datraw, - xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") + xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/", + glossar = TRUE) artworks <- unique(na.omit(datlogs$artwork)) topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"), @@ -95,12 +100,14 @@ for (i in seq_len(nrow(sf))) { ## Merge data -dat1 <- merge(datlogs_topics, hd, by.x = "date.start", by.y = "date", all.x = TRUE) -dat2 <- merge(dat1, sfdat, by.x = "date.start", by.y = "date", all.x = TRUE) +datlogs_topics$date <- as.Date(datlogs_topics$date.start) + +dat1 <- merge(datlogs_topics, hd, by.x = "date", by.y = "date", all.x = TRUE) +dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE) ## Export data -write.table(dat2, paste0("../data/haum/event_logfiles_", now, ".csv"), +write.table(dat2, paste0("../data/haum/event_logfiles_glossar_", now, ".csv"), sep = ";", row.names = FALSE) # TODO: Maybe add infos about artworks? diff --git a/code/04_modeling_haum.R b/code/04_modeling_haum.R index d38b4a9..6cd676d 100644 --- a/code/04_modeling_haum.R +++ b/code/04_modeling_haum.R @@ -2,22 +2,14 @@ # Read data -# dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", -# sep = ";", header = TRUE) -dat0 <- read.table("../data/haum/event_logfiles_small_metadata_2023-10-19_18-25-26.csv", +dat0 <- read.table("../data/haum/event_logfiles_2023-10-25_17-29-52.csv", sep = ";", header = TRUE) -dat0$date <- as.Date(dat0$date) dat0$date.start <- as.POSIXct(dat0$date.start) dat0$date.stop <- as.POSIXct(dat0$date.stop) dat0$artwork <- sprintf("%03d", dat0$artwork) -# TODO: Write a function that closes events spanning different log files -# OR: Remove openTopic and OpenPopup events that do not start with a -# flipCard (AND openPopup events without openTopic event beforehand) - table(dat0[!duplicated(dat0$trace), "event"]) -# flipCard move openPopup openTopic -# 45528 247718 981 3457 + proportions(table(dat0[!duplicated(dat0$trace), "event"])) tmp <- dat0[!duplicated(dat0$trace) & dat0$event %in% c("openTopic", @@ -48,38 +40,28 @@ table(table(dat$start)) summary(aggregate(duration ~ trace, dat, mean)) -# remove fragmented traces -tab <- xtabs( ~ trace + event, dat) - -fragments <- NULL - -for (i in seq_len(nrow(tab))) { - if (tab[i, "openPopup"] != 0 & tab[i, "flipCard"] == 0) { - fragments <- c(fragments, rownames(tab)[i]) - } else if (tab[i, "openTopic"] != 0 & tab[i, "flipCard"] == 0) { - fragments <- c(fragments, rownames(tab)[i]) - } else if (tab[i, "openPopup"] != 0 & tab[i, "openTopic"] == 0) { - fragments <- c(fragments, rownames(tab)[i]) - } -} -datrm <- dat[!dat$trace %in% fragments, ] - # TODO: Find trace that has flipCard --> openPopup --> openTopic -dato <- datrm[datrm$event != "move", ] -tmp <- lapply(unique(dato$trace), function(x) unique(dato[dato$trace == x, "event"])) -names(tmp) <- unique(dato$trace) +dato <- dat[dat$event != "move", ] +dato_split <- split(dato, ~ trace) +tmp <- lapply(dato_split, function(x) unique(x$event)) +#tmp <- lapply(unique(dato$trace), function(x) unique(dato[dato$trace == x, "event"])) ids <- sapply(tmp, length) == 3 -do.call(rbind, tmp[ids]) +tmp2 <- as.data.frame(do.call(rbind, tmp[ids])) +names(tmp2) <- c("flipCard", "openTopic", "openPopup") -# TODO: -# fragmentary traces: for 4591 openTopic for topic 1 is in the raw log files, but gets -# probably removed in close_events(); how can I prevent that? How can I fix -# the traces and eventIds that do not match correctly ??? -ct <- c(4591, 5937, 7080, 8412, 8279) -datrm[datrm$trace %in% ct, 1:10] +table(tmp2$flipCard) +table(tmp2$openTopic) +table(tmp2$openPopup) + +frag_ids <- which(tmp2$openTopic == "openPopup") + +tmp3 <- dat[dat$trace %in% rownames(tmp2)[frag_ids], ] + +tmp4 <- tmp3[!tmp3$glossar == 1, ] + +dat6 <- rbind(dat[!dat$trace %in% rownames(tmp2)[frag_ids], ], tmp4) -### WHY????? alog <- activitylog(dat, case_id = "trace",