596 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			596 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
| #' ---
 | |
| #' title: "Preprocessing log files"
 | |
| #' author: "Nora Wickelmaier"
 | |
| #' date: "`r Sys.Date()`"
 | |
| #' output:
 | |
| #'   html_document:
 | |
| #'     toc: true
 | |
| #'     toc_float: true
 | |
| #'   pdf_document:
 | |
| #'     toc: true
 | |
| #'     number_sections: true
 | |
| #' geometry: margin = 2.5cm
 | |
| #' ---
 | |
| 
 | |
| # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 | |
| 
 | |
| # LogEntry classes:
 | |
| #   TRANSFORM_START:    "Transform start" --> "Transformation Start" in Tool
 | |
| #   TRANSFORM_STOP:     "Transform stop"
 | |
| #   START_APPLICATION:  "Start Application"
 | |
| #   SHOW_APPLICATION:   "Show Application"
 | |
| #   SHOW_INFO:          "Show Info"       --> "Flip Card" in Tool
 | |
| #   SHOW_FRONT:         "Show Front"
 | |
| #   SHOW_POPUP:         "ShowPopup"       --> "Show Popup" in Tool
 | |
| #   HIDE_POPUP:         "HidePopup"
 | |
| #   ARTWORK:            "Artwork"         --> "Show Topic" in Tool
 | |
| 
 | |
| #' # Read data
 | |
| 
 | |
| dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
 | |
|                    header = TRUE)
 | |
| dat0$date <- as.POSIXct(dat0$date)  # create date object
 | |
| dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
 | |
| 
 | |
| #' # Remove irrelevant events
 | |
| 
 | |
| #' ## Remove Start Application and Show Application
 | |
| 
 | |
| dat <- subset(dat0, !(dat0$event %in% c("Start Application",
 | |
|                                         "Show Application")))
 | |
| rownames(dat) <- NULL
 | |
| 
 | |
| #' # Close events
 | |
| 
 | |
| ########
 | |
| 
 | |
| #' Do it for Transform events first
 | |
| dat1 <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
 | |
| dat1 <- dat1[order(dat1$artwork, dat1$date), ]
 | |
| rownames(dat1) <- NULL
 | |
| 
 | |
| # Create event ID for closing events
 | |
| num_start <- diff(c(0, which(dat1$event == "Transform stop")))
 | |
| dat1$eventid <- rep(seq_along(num_start), num_start)
 | |
| head(dat1[, c("event", "eventid")], 25)
 | |
| 
 | |
| table(table(dat1$eventid))
 | |
| #   1     2     3     4     5     6     7     8    10    11
 | |
| #  70 78435  5153   842   222    66    18    14     3     1 
 | |
| # --> compare to table(num_start)!
 | |
| 
 | |
| # Find out how often "Transform stop" follows each other
 | |
| num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
 | |
| table(num_stop)
 | |
| 
 | |
| dat1$eventrep <- rep(num_start, num_start)
 | |
| dat1$dupl <- duplicated(dat1[, c("event", "eventid")])                    # keep first
 | |
| dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE)   # keep last
 | |
| dat1[dat1$eventrep == 10, ]
 | |
| 
 | |
| dat1$dupl <- NULL
 | |
| dat1$eventrep <- NULL
 | |
| 
 | |
| 
 | |
| # remove duplicated "Transform start" events
 | |
| dat1 <- dat1[!duplicated(dat1[, c("event", "eventid")]), ]
 | |
| 
 | |
| # remove duplicated "Transform stop" events
 | |
| id_stop  <- which(dat1$event == "Transform stop")
 | |
| id_rm_stop <- id_stop[diff(id_stop) == 1]
 | |
| 
 | |
| dat1 <- dat1[-(id_rm_stop + 1), ]
 | |
| 
 | |
| # transform to wide data format
 | |
| dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")
 | |
| 
 | |
| trans_wide <- reshape(dat1, direction = "wide",
 | |
|                       idvar = c("eventid", "artwork", "glossar"),
 | |
|                       timevar = "time",
 | |
|                       drop = c("popup", "card", "event")
 | |
| )
 | |
| # TODO: This runs for quite some time
 | |
| # --> Is this more efficient with tidyr::pivot_wider?
 | |
| 
 | |
| # --> when fileid is part of the reshape, it does not work correctly, since
 | |
| # we sometimes have a start - stop event that is recorded in two separate
 | |
| # log files, BUT: after finding out, that `time_ms` changes for each log
 | |
| # file, I want to exclude those cases, so `fileid` has to be included!!!
 | |
| 
 | |
| # check how often an eventid is associated with two fileids
 | |
| nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
 | |
| 
 | |
| # exclude from data set ??
 | |
| # trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
 | |
| 
 | |
| # which(is.na(trans_wide$date.start))
 | |
| 
 | |
| trans_wide$event <- "move"
 | |
| trans_wide$eventid <- NULL
 | |
| 
 | |
| rownames(trans_wide) <- NULL
 | |
| 
 | |
| trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start
 | |
| #trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start
 | |
| # only seconds - not fine grained enough
 | |
| trans_wide$distance <- apply(
 | |
|       trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
 | |
|       function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
 | |
| trans_wide$rotationDegree <- trans_wide$rotation.stop -
 | |
|   trans_wide$rotation.start
 | |
| trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start
 | |
| 
 | |
| trans_wide$trace <- NA
 | |
| trans_wide$card  <- NA
 | |
| trans_wide$popup <- NA
 | |
| 
 | |
| dat_trans <- trans_wide[trans_wide$distance != 0 &
 | |
|                         trans_wide$rotationDegree != 0 &
 | |
|                         trans_wide$scaleSize != 1,
 | |
|                       c("fileid.start", "fileid.stop", "event", "artwork",
 | |
|                         "trace", "glossar", "date.start", "date.stop",
 | |
|                         "time_ms.start", "time_ms.stop", "duration",
 | |
|                         "card", "popup", "x.start", "y.start", "x.stop",
 | |
|                         "y.stop", "distance", "scale.start", "scale.stop",
 | |
|                         "scaleSize", "rotation.start", "rotation.stop",
 | |
|                         "rotationDegree")]
 | |
| 1 - nrow(dat_trans) / nrow(trans_wide)
 | |
| # removes almost 2/3 of the data (for small data set)
 | |
| 
 | |
| rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop)
 | |
| 
 | |
| summary(dat_trans)
 | |
| 
 | |
| 
 | |
| #' # Close other events
 | |
| 
 | |
| dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
 | |
| # dat2$x <- NULL
 | |
| # dat2$y <- NULL
 | |
| # dat2$scale <- NULL
 | |
| # dat2$rotation <- NULL
 | |
| rownames(dat2) <- NULL
 | |
| 
 | |
| # Create event ID for closing events
 | |
| # num_start <- diff(c(0, which(dat2$event == "Show Front")))
 | |
| # dat2$trace <- rep(seq_along(num_start), num_start)
 | |
| # head(dat2[, c("artwork", "event", "trace")], 50)
 | |
| # --> does not work because of glossar entries... can't sort by artwork
 | |
| 
 | |
| 
 | |
| dat2$trace <- NA
 | |
| last_event <- dat2$event[1]
 | |
| aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
 | |
| #
 | |
| for (art in aws) {              # select artwork
 | |
| 
 | |
|   for (i in 1:nrow(dat2)) {      # go through rows
 | |
| 
 | |
|     if (last_event == "Show Info" & dat2$artwork[i] == art) {
 | |
|       dat2$trace[i] <- i
 | |
|       j <- i
 | |
| 
 | |
|     } else if (last_event == "Show Front" & dat2$artwork[i] == art) {
 | |
|       dat2$trace[i] <- j
 | |
| 
 | |
|     } else if (!(last_event %in% c("Show Info", "Show Front")) &
 | |
|                dat2$artwork[i] == art) {
 | |
|       dat2$trace[i] <- j
 | |
|     }
 | |
| 
 | |
|     if (i <= nrow(dat2)) {
 | |
|       last_event <- dat2$event[i + 1]
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| head(dat2[, c("artwork", "event", "trace")], 50)
 | |
| tail(dat2[, c("artwork", "event", "trace")], 50)
 | |
| 
 | |
| rm(aws, i, j, last_event, art)
 | |
| 
 | |
| ## Fix glossar entries
 | |
| 
 | |
| ### Find artwork for glossar entry
 | |
| 
 | |
| glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
 | |
| 
 | |
| # load lookup table for artworks and glossar files
 | |
| load("../data/glossar_dict.RData")
 | |
| lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
 | |
| 
 | |
| head(dat2[, c("artwork", "event", "popup", "trace")], 20)
 | |
| 
 | |
| 
 | |
| for (file in lut$glossar_file) {
 | |
| 
 | |
|   artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
 | |
| 
 | |
|   for (i in seq_len(nrow(dat2))) {
 | |
| 
 | |
|     if (dat2$event[i] == "Show Info") {
 | |
| 
 | |
|       current_artwork <- dat2[i, "artwork"]
 | |
|       j <- i
 | |
|       k <- i
 | |
| 
 | |
|     } else {
 | |
| 
 | |
|       current_artwork <- current_artwork
 | |
| 
 | |
|     }
 | |
| 
 | |
|     if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) {
 | |
|     # make sure artwork has not been closed, yet!
 | |
|       k <- i
 | |
|     }
 | |
| 
 | |
|     if (dat2$artwork[i] == "glossar" &
 | |
|         (current_artwork %in% artwork_list) &
 | |
|         dat2$popup[i] == file & (j-k == 0)) {
 | |
| 
 | |
|       dat2[i, "trace"] <- dat2[j, "trace"]
 | |
|       dat2[i, "artwork"] <- current_artwork
 | |
| 
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| # --> finds about half of the glossar entries for the small data set...
 | |
| table(is.na(dat2[dat2$glossar == 1, "trace"]))
 | |
| 
 | |
| # REMEMBER: It can never bo 100% correct, since it is always possible that
 | |
| # several cards are open and that they link to the same glossar entry
 | |
| 
 | |
| # How many glossar_files are only associated with one artwork?
 | |
| lut[sapply(lut$artwork, length) == 1, "glossar_file"]
 | |
| 
 | |
| single <- lut[sapply(lut$artwork, length) == 1, "glossar_file"]
 | |
| tmp <- subset(dat2, is.na(dat2$trace))$popup
 | |
| inside <- unique(tmp[tmp %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]])
 | |
| single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
 | |
| tmp_lut <- data.frame(glossar_file = sort(inside), artwork = single_art)
 | |
| 
 | |
| 
 | |
| dat2[dat2$glossar == 1, c("artwork", "popup", "glossar", "trace")]
 | |
| 
 | |
| for (file in tmp_lut$glossar_file) {
 | |
| 
 | |
|   for (i in seq_len(nrow(dat2))) {
 | |
| 
 | |
|     if (dat2$event[i] == "Artwork/OpenCard" & dat2$artwork[i] %in% tmp_lut$artwork) {
 | |
| 
 | |
|       current_artwork <- dat2[i, "artwork"]
 | |
|       j <- i
 | |
| 
 | |
|     }
 | |
| 
 | |
|     if (dat2$artwork[i] == "glossar" &
 | |
|         dat2$popup[i] == file) {
 | |
| 
 | |
|       dat2[i, "trace"] <- dat2[j, "trace"]
 | |
|       dat2[i, "artwork"] <- current_artwork
 | |
| 
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| dat2[14110:14130, ]
 | |
| 
 | |
| # TODO: Integrate for loop into for loop above
 | |
| 
 | |
| # TODO: For now: Exclude not matched glossar entries
 | |
| 
 | |
| df <- subset(dat2, !is.na(dat2$trace))
 | |
| df <- df[order(df$trace), ]
 | |
| rownames(df) <- NULL
 | |
| 
 | |
| rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list,
 | |
|    glossar_files)
 | |
| 
 | |
| #' ## Close flipCard
 | |
| 
 | |
| dat3 <- subset(df, df$event %in% c("Show Info", "Show Front"))
 | |
| 
 | |
| dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop")
 | |
| 
 | |
| flipCard_wide <- reshape(dat3, direction = "wide",
 | |
|                         idvar = c("trace", "artwork", "glossar"),
 | |
|                         timevar = "time",
 | |
|                         drop = c("popup", "card"))
 | |
| flipCard_wide$event <- "flipCard"
 | |
| flipCard_wide$duration <- flipCard_wide$time_ms.stop -
 | |
|   flipCard_wide$time_ms.start
 | |
| 
 | |
| 
 | |
| flipCard_wide$card            <- NA
 | |
| flipCard_wide$popup           <- NA
 | |
| flipCard_wide$x.start         <- NA
 | |
| flipCard_wide$x.stop          <- NA
 | |
| flipCard_wide$y.start         <- NA
 | |
| flipCard_wide$y.stop          <- NA
 | |
| flipCard_wide$distance        <- NA
 | |
| flipCard_wide$scale.start     <- NA
 | |
| flipCard_wide$scale.stop      <- NA
 | |
| flipCard_wide$scaleSize       <- NA
 | |
| flipCard_wide$rotation.start  <- NA
 | |
| flipCard_wide$rotation.stop   <- NA
 | |
| flipCard_wide$rotationDegree  <- NA
 | |
| 
 | |
| dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
 | |
|                                   "artwork", "trace", "glossar",
 | |
|                                   "date.start", "date.stop",
 | |
|                                   "time_ms.start", "time_ms.stop",
 | |
|                                   "duration", "card", "popup", "x.start",
 | |
|                                   "y.start", "x.stop", "y.stop",
 | |
|                                   "distance", "scale.start", "scale.stop",
 | |
|                                   "scaleSize", "rotation.start",
 | |
|                                   "rotation.stop", "rotationDegree")]
 | |
| 
 | |
| rm(flipCard_wide)
 | |
| 
 | |
| #' ## Close openTopic
 | |
| 
 | |
| dat4 <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard"))
 | |
| dat4 <- dat4[order(dat4$artwork, dat4$date), ]
 | |
| rownames(dat4) <- NULL
 | |
| 
 | |
| num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard")))
 | |
| dat4$eventid <- rep(seq_along(num_start), num_start)
 | |
| 
 | |
| dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop")
 | |
| 
 | |
| openTopic_wide <- reshape(dat4, direction = "wide",
 | |
|                          idvar = c("eventid", "trace", "glossar", "artwork", "card"),
 | |
|                          timevar = "time", drop = "popup")
 | |
| openTopic_wide$event <- "openTopic"
 | |
| openTopic_wide$duration <- openTopic_wide$time_ms.stop -
 | |
|   openTopic_wide$time_ms.start
 | |
| 
 | |
| 
 | |
| openTopic_wide$popup           <- NA
 | |
| openTopic_wide$x.start         <- NA
 | |
| openTopic_wide$x.stop          <- NA
 | |
| openTopic_wide$y.start         <- NA
 | |
| openTopic_wide$y.stop          <- NA
 | |
| openTopic_wide$distance        <- NA
 | |
| openTopic_wide$scale.start     <- NA
 | |
| openTopic_wide$scale.stop      <- NA
 | |
| openTopic_wide$scaleSize       <- NA
 | |
| openTopic_wide$rotation.start  <- NA
 | |
| openTopic_wide$rotation.stop   <- NA
 | |
| openTopic_wide$rotationDegree  <- NA
 | |
| 
 | |
| dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
 | |
|                                     "artwork", "trace", "glossar",
 | |
|                                     "date.start", "date.stop",
 | |
|                                     "time_ms.start", "time_ms.stop",
 | |
|                                     "duration", "card", "popup", "x.start",
 | |
|                                     "y.start", "x.stop", "y.stop",
 | |
|                                     "distance", "scale.start",
 | |
|                                     "scale.stop", "scaleSize",
 | |
|                                     "rotation.start", "rotation.stop",
 | |
|                                     "rotationDegree")]
 | |
| # TODO: card should have a unique identifier for each artwork
 | |
| 
 | |
| rm(openTopic_wide, num_start)
 | |
| 
 | |
| #' ## close openPopup
 | |
| dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
 | |
| dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ]
 | |
| rownames(dat5) <- NULL
 | |
| 
 | |
| num_start <- diff(c(0, which(dat5$event == "HidePopup")))
 | |
| # last event is "ShowPopup"! Needs to be fixed
 | |
| # num_start <- c(num_start, 1)
 | |
| # TODO: Needs to be caught in a function --> not anymore - still relevant???
 | |
| 
 | |
| dat5$eventid <- rep(seq_along(num_start), num_start)
 | |
| 
 | |
| dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop")
 | |
| 
 | |
| openPopup_wide <- reshape(dat5, direction = "wide",
 | |
|                          idvar = c("eventid", "trace", "glossar", "artwork", "popup"),
 | |
|                          timevar = "time", drop = "card")
 | |
| # there is a pathological entry which gets deleted...
 | |
| # df[df$trace == 4595, ]
 | |
| # --> artwork 046 popup selene.xml gets opened twice
 | |
| 
 | |
| 
 | |
| openPopup_wide$event <- "openPopup"
 | |
| openPopup_wide$duration <- openPopup_wide$time_ms.stop -
 | |
|   openPopup_wide$time_ms.start
 | |
| 
 | |
| openPopup_wide$card            <- NA
 | |
| openPopup_wide$x.start         <- NA
 | |
| openPopup_wide$x.stop          <- NA
 | |
| openPopup_wide$y.start         <- NA
 | |
| openPopup_wide$y.stop          <- NA
 | |
| openPopup_wide$distance        <- NA
 | |
| openPopup_wide$scale.start     <- NA
 | |
| openPopup_wide$scale.stop      <- NA
 | |
| openPopup_wide$scaleSize       <- NA
 | |
| openPopup_wide$rotation.start  <- NA
 | |
| openPopup_wide$rotation.stop   <- NA
 | |
| openPopup_wide$rotationDegree  <- NA
 | |
| 
 | |
| dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
 | |
|                                     "artwork", "trace", "glossar",
 | |
|                                     "date.start", "date.stop",
 | |
|                                     "time_ms.start", "time_ms.stop",
 | |
|                                     "duration", "card", "popup", "x.start",
 | |
|                                     "y.start", "x.stop", "y.stop",
 | |
|                                     "distance", "scale.start",
 | |
|                                     "scale.stop", "scaleSize",
 | |
|                                     "rotation.start", "rotation.stop",
 | |
|                                     "rotationDegree")]
 | |
| rm(num_start, openPopup_wide)
 | |
| 
 | |
| 
 | |
| # TODO: Should card maybe also be filled in for "openPopup"?
 | |
| 
 | |
| #' ## Use `rbind()` instead...
 | |
| # --> unbeatable in terms of time!
 | |
| 
 | |
| dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup)
 | |
| 
 | |
| # check
 | |
| nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
 | |
|                   nrow(dat_openTopic) + nrow(dat_openPopup))
 | |
| 
 | |
| # remove all events that do not have a `date.start`
 | |
| dim(dat_all[is.na(dat_all$date.start), ])
 | |
| dat_all <- dat_all[!is.na(dat_all$date.start), ]
 | |
| # There is only a `date.stop`, when event is not properly closed, see here:
 | |
| df[df$trace == 1843, ]
 | |
| dat_openPopup[dat_openPopup$trace == 1843, ]
 | |
| ## --> still 50 (small data set) left, and some really do not seem to be
 | |
| ## opened! Must be a log error
 | |
| # --> others should be closed!
 | |
| dat[31000:31019,]     # this one e.g.
 | |
| # --> Actually NOT! card gets flipped before! Again - log error!
 | |
| 
 | |
| 
 | |
| # Remove durations when event spans more than one log file, since they are
 | |
| # not interpretable
 | |
| dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA
 | |
| 
 | |
| 
 | |
| # sort by `start.date`
 | |
| dat_all <- dat_all[order(dat_all$date.start), ]
 | |
| rownames(dat_all) <- NULL
 | |
| 
 | |
| ind <- rowSums(is.na(dat_all)) == ncol(dat_all)
 | |
| any(ind)
 | |
| dat_all[ind, ]
 | |
| # --> No rows with only NA, as it should be.
 | |
| 
 | |
| summary(dat_all)    # OK, this actually makes a lot of sense :)
 | |
| 
 | |
| #' ## Create case variable
 | |
| 
 | |
| #dat_all$timediff <- as.numeric(dat_all$date.stop - dat_all$date.start)
 | |
| 
 | |
| dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start)))
 | |
| 
 | |
| hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50)
 | |
| 
 | |
| 
 | |
| # TODO: What is the best choice for the cutoff here? I took 20 secs for now
 | |
| dat_all$case <- NA
 | |
| j <- 1
 | |
| 
 | |
| for (i in seq_len(nrow(dat_all))) {
 | |
|   if (dat_all$timediff[i] < 21) {
 | |
|     dat_all$case[i] <- j
 | |
|   } else {
 | |
|     j <- j + 1
 | |
|     dat_all$case[i] <- j
 | |
|   }
 | |
| }
 | |
| 
 | |
| head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100)
 | |
| 
 | |
| #' ## Add event ID
 | |
| 
 | |
| dat_all$eventid <- seq_len(nrow(dat_all))
 | |
| 
 | |
| dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case",
 | |
|                        "trace", "glossar", "event", "artwork",
 | |
|                        "date.start", "date.stop", "time_ms.start",
 | |
|                        "time_ms.stop", "duration", "card", "popup",
 | |
|                        "x.start", "y.start", "x.stop", "y.stop",
 | |
|                        "distance", "scale.start", "scale.stop",
 | |
|                        "scaleSize", "rotation.start", "rotation.stop",
 | |
|                        "rotationDegree")]
 | |
| 
 | |
| #' ## Add `trace` numbers for `move` events
 | |
| 
 | |
| # when case and artwork are identical and there is only 1 trace value
 | |
| # --> assign it to all `move` events for that case and artwork
 | |
| # when case and artwork are identical and there is more than 1 trace value
 | |
| # --> assign the `trace` value that was right before this `move` event
 | |
| # (could, of course, also be after)
 | |
| 
 | |
| cases <- unique(dat_all$case)
 | |
| aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"]
 | |
| max_trace <- max(dat_all$trace, na.rm = TRUE) + 1
 | |
| out <- NULL
 | |
| 
 | |
| for (case in cases) {
 | |
|   for (art in aws) {
 | |
|     tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ]
 | |
|     if (nrow(tmp) != 0) {
 | |
| 
 | |
|       if (length(na.omit(unique(tmp$trace))) == 1) {
 | |
|         tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
 | |
|       } else if (length(na.omit(unique(tmp$trace))) > 1) {
 | |
|         for (i in 1:nrow(tmp)) {
 | |
|           if (tmp$event[i] == "move") {
 | |
|             if (i == 1) {
 | |
|               tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
 | |
|             } else {
 | |
|               tmp$trace[i] <- tmp$trace[i - 1]
 | |
|             }
 | |
|           }
 | |
|         }
 | |
|       } else if (all(is.na(tmp$trace))) {
 | |
|         for (i in 1:nrow(tmp)) {
 | |
|           if (tmp$event[i] == "move") {
 | |
|             tmp$trace[i] <- max_trace
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|       max_trace <- max_trace + 1
 | |
|     }
 | |
|     if (nrow(tmp) > 0) {
 | |
|       #print(tmp[, c("case", "event", "trace", "artwork")])
 | |
|       out <- rbind(out, tmp)
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| # TODO: Get rid of the loops
 | |
| # --> This takes forever...
 | |
| 
 | |
| #head(out[, c("time_ms.start", "case", "trace", "event", "artwork")], 55)
 | |
| 
 | |
| #head(dat_all[dat_all$artwork %in% "501", c("time_ms.start", "case", "trace", "event", "artwork")], 50)
 | |
| 
 | |
| # identical(dat_all[which(!dat_all$eventid %in% out$eventid), ],
 | |
| #           dat_all[dat_all$artwork == "glossar", ])
 | |
| # --> TRUE
 | |
| 
 | |
| # put glossar events back in
 | |
| 
 | |
| #dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
 | |
| out <- out[order(out$date.start), ]
 | |
| rownames(out) <- NULL
 | |
| 
 | |
| # Make `trace` a consecutive number
 | |
| out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
 | |
| 
 | |
| #head(out[, c("trace", "trace2")], 50)
 | |
| 
 | |
| #' # Export data
 | |
| 
 | |
| write.table(out, "../data/event_logfiles.csv",
 | |
|             sep = ";", quote = FALSE, row.names = FALSE)
 | |
| 
 | |
| # Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
 | |
| # unit??? Maybe look at differences between timestamps separately for
 | |
| # `artwork`? And identify "new observational unit" this way?
 | |
| #
 | |
| # Definition: (???)
 | |
| # 1. Touching a new `artwork` corresponds to "observational unit change"
 | |
| # 2. Time interval of XX min within one `artwork` on the same day
 | |
| #    corresponds to "observational unit change"
 | |
| 
 | |
| # Split data frame in list of data frame which all correspond to one
 | |
| # artwork
 | |
| # dat_art <- split(dat, dat$artwork)
 | |
| 
 | |
| # TODO: Write function for closing events
 | |
| 
 |