290 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			290 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
#' ---
 | 
						|
#' title: "Preprocessing log files"
 | 
						|
#' author: "Nora Wickelmaier"
 | 
						|
#' date: "`r Sys.Date()`"
 | 
						|
#' output:
 | 
						|
#'   html_document:
 | 
						|
#'     toc: true
 | 
						|
#'     toc_float: true
 | 
						|
#'   pdf_document:
 | 
						|
#'     toc: true
 | 
						|
#'     number_sections: true
 | 
						|
#' geometry: margin = 2.5cm
 | 
						|
#' ---
 | 
						|
 | 
						|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 | 
						|
 | 
						|
# LogEntry classes:
 | 
						|
#   TRANSFORM_START:    "Transform start" --> "Transformation Start" in Tool
 | 
						|
#   TRANSFORM_STOP:     "Transform stop"
 | 
						|
#   START_APPLICATION:  "Start Application"
 | 
						|
#   SHOW_APPLICATION:   "Show Application"
 | 
						|
#   SHOW_INFO:          "Show Info"       --> "Flip Card" in Tool
 | 
						|
#   SHOW_FRONT:         "Show Front"
 | 
						|
#   SHOW_POPUP:         "ShowPopup"       --> "Show Popup" in Tool
 | 
						|
#   HIDE_POPUP:         "HidePopup"
 | 
						|
#   ARTWORK:            "Artwork"         --> "Show Topic" in Tool
 | 
						|
 | 
						|
#' # Read data
 | 
						|
 | 
						|
# Load the semicolon-separated raw log file; the first line holds the
# column names.
dat0 <- read.table(file = "../data/rawdata_logfiles_small.csv",
                   header = TRUE, sep = ";")

# Convert the `date` column to POSIXct so it can be sorted and compared.
dat0$date <- as.POSIXct(dat0$date)
 | 
						|
 | 
						|
# TODO: Add a case identifier based on timestamps -- needs to be done on
 | 
						|
# "raw data". Is it possible? Something seems seriously wrong with
 | 
						|
# `time_ms`
 | 
						|
 | 
						|
#' # Remove irrelevant events
 | 
						|
 | 
						|
#' ## Remove Start Application and Show Application
 | 
						|
 | 
						|
# Events that only mark the application being started or shown; they are
# dropped before any further processing.
irrelevant_events <- c("Start Application", "Show Application")
dat <- dat0[!(dat0$event %in% irrelevant_events), ]
 | 
						|
 | 
						|
#' # Close events
 | 
						|
 | 
						|
#' Do it for Transform events first
 | 
						|
tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
 | 
						|
tmp <- tmp[order(tmp$artwork, tmp$date), ]
 | 
						|
rownames(tmp) <- NULL
 | 
						|
 | 
						|
# Find out how often "Transform start" follows each other.
#
# Every "Transform stop" closes the current event. `num_start` is the number
# of rows belonging to each closed event (the gap between successive stop
# positions).
is_stop <- tmp$event == "Transform stop"
num_start <- diff(c(0, which(is_stop)))

# Event id of a row = 1 + number of stops seen *before* that row. Unlike
# rep(seq_along(num_start), num_start), this always yields exactly one id per
# row, even when the data do not end with a "Transform stop" (trailing rows
# then form one last, unclosed event).
tmp$eventid <- cumsum(is_stop) - is_stop + 1L
 | 
						|
head(tmp[, c("event", "eventid")], 25)
 | 
						|
 | 
						|
table(table(tmp$eventid))
 | 
						|
#   1     2     3     4     5     6     7     8    10    11
 | 
						|
#  73 78429  5156   842   222    66    18    14     3     1
 | 
						|
# --> compare to table(num_start)!
 | 
						|
 | 
						|
# Find out how often "Transform stop" follows each other
 | 
						|
num_stop <- c(diff(c(0, which(tmp$event == "Transform start"))))
 | 
						|
table(num_stop)
 | 
						|
 | 
						|
tmp$eventrep <- rep(num_start, num_start)
 | 
						|
tmp$dupl <- duplicated(tmp[, c("event", "eventid")])                    # keep first
 | 
						|
tmp$dupl <- duplicated(tmp[, c("event", "eventid")], fromLast = TRUE)   # keep last
 | 
						|
tmp[tmp$eventrep == 10, ]
 | 
						|
 | 
						|
tmp$dupl <- NULL
 | 
						|
tmp$eventrep <- NULL
 | 
						|
 | 
						|
 | 
						|
# remove duplicated "Transform start" events
 | 
						|
tmp <- tmp[!duplicated(tmp[, c("event", "eventid")]), ]
 | 
						|
 | 
						|
# remove duplicated "Transform stop" events
#
# When two stops sit in directly adjacent rows, the second one is dropped.
id_stop <- which(tmp$event == "Transform stop")

# diff(id_stop) has one element fewer than id_stop. Pair it with id_stop[-1]
# (the later stop of each neighbouring pair) so the logical mask has the same
# length as the vector it indexes and is not recycled.
id_rm_stop <- id_stop[-1][diff(id_stop) == 1]

# A negative index built from an empty vector would select zero rows, so
# only subset when there is something to remove.
if (length(id_rm_stop) > 0) {
  tmp <- tmp[-id_rm_stop, ]
}
 | 
						|
 | 
						|
# transform to wide data format
 | 
						|
tmp$event <- ifelse(tmp$event == "Transform start", "start", "stop")
 | 
						|
 | 
						|
trans_wide <- reshape(tmp, direction = "wide",
 | 
						|
                      idvar = c("eventid", "artwork"),
 | 
						|
                      timevar = "event", drop = c("fileid", "popup", "card")
 | 
						|
)
 | 
						|
 | 
						|
rownames(trans_wide) <- NULL
 | 
						|
# --> when fileid is part of the reshape, it does not work correctly, since
 | 
						|
# we sometimes have a start - stop event that is recorded in two separate
 | 
						|
# log files
 | 
						|
# TODO: This runs for quite some time. Is this more efficient with dplyr?
 | 
						|
 | 
						|
# which(is.na(trans_wide$date.start))
 | 
						|
 | 
						|
# Derived variables per Transform event (stop value minus start value).
trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start

# Euclidean distance between the start and the stop position, computed on
# whole columns instead of calling dist() once per row. A missing coordinate
# gives NA here, whereas dist() would rescale the remaining coordinate.
trans_wide$distance <- sqrt((trans_wide$x.stop - trans_wide$x.start)^2 +
                            (trans_wide$y.stop - trans_wide$y.start)^2)

trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start
trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start
 | 
						|
 | 
						|
trans_wide <- trans_wide[trans_wide$distance != 0 &
 | 
						|
                         trans_wide$rotationDegree != 0 &
 | 
						|
                         trans_wide$scaleSize != 0, ]
 | 
						|
# removes almost 2/3 of the data (for small data set)
 | 
						|
 | 
						|
 | 
						|
# TODO: How do I handle popups from glossar???
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# Should every "Show front" be the beginning of a new trace?
 | 
						|
# Should Transform events be handled separately and then be "added" again
 | 
						|
# by timestamp?
 | 
						|
 | 
						|
########
 | 
						|
tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
 | 
						|
rownames(tmp) <- NULL
 | 
						|
 | 
						|
# Tag flip-card events: "Show Info" opens a trace, "Show Front" closes it;
# every other event keeps NA.
#
# This replaces a nested loop over all artworks and all rows. In that loop
# `last_event` always held the event of the row being processed, and every
# row's artwork matches one of the looped artworks, so the result depended
# on the event type only. The loop also read one element past the end of
# `tmp$event` (NA), which could abort the `if` on a later artwork.
tmp$trace <- NA
tmp$trace[tmp$event %in% "Show Info"] <- "start"
tmp$trace[tmp$event %in% "Show Front"] <- "stop"
 | 
						|
 | 
						|
 | 
						|
head(tmp[4:ncol(tmp)], 50)
 | 
						|
# TODO: Great job! You used a for-loop to rename "Show info" and "Show
 | 
						|
# front" to "start" and "stop" ;)
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
#' ## Remove "button presses"
 | 
						|
 | 
						|
# Sort data frame by artwork and date
 | 
						|
dat <- dat[order(dat$artwork, dat$date), ]
 | 
						|
 | 
						|
# Remove Transform events of the same type that sit in directly adjacent
# rows (start directly followed by start, stop directly followed by stop),
# since I do not know how to interpret them as events. The earlier row of
# each such pair is dropped.
id_start <- which(dat$event == "Transform start")
id_stop  <- which(dat$event == "Transform stop")

# diff() returns one element fewer than its input. Index head(x, -1) so the
# logical mask and the indexed vector have equal length and nothing is
# recycled.
id_rm_start <- head(id_start, -1)[diff(id_start) == 1]
id_rm_stop <- head(id_stop, -1)[diff(id_stop) == 1]

# dat[-integer(0), ] would return zero rows, so guard the empty case.
id_rm <- c(id_rm_start, id_rm_stop)
if (length(id_rm) > 0) {
  dat <- dat[-id_rm, ]
}
rownames(dat) <- NULL
 | 
						|
 | 
						|
 | 
						|
id_start2 <- which(dat$event == "Transform start")
 | 
						|
id_stop2  <- which(dat$event == "Transform stop")
 | 
						|
 | 
						|
length(id_start2) - length(id_stop2)
 | 
						|
# 340 --> "starts too many"
 | 
						|
 | 
						|
# remove "Transform start" and "Transform stop" following directly each
 | 
						|
# other (but with events in between!)
 | 
						|
# Working copies of the start/stop row indices; entries are removed below
# until starts and stops alternate.
id_start_new <- id_start2
id_stop_new <- id_stop2

# Compare neighbouring starts with the stop at the previous position:
#   * two starts before that stop     --> drop the earlier start
#   * that stop lies before both starts --> drop the stop
# The iteration range is fixed before the loop while both vectors shrink
# inside it, so stop as soon as a position no longer exists. Otherwise the
# comparison yields NA and `if` fails. seq_len(...)[-1] is empty for fewer
# than two starts, where 2:length(...) would count backwards.
for (i in seq_len(length(id_start_new))[-1]) {
  if (i > length(id_start_new) || (i - 1) > length(id_stop_new)) {
    break
  }
  prev_start <- id_start_new[i - 1]
  this_start <- id_start_new[i]
  prev_stop <- id_stop_new[i - 1]

  if (prev_start < prev_stop && this_start < prev_stop) {
    id_start_new <- id_start_new[-(i - 1)]
  } else if (prev_start > prev_stop && this_start > prev_stop) {
    id_stop_new <- id_stop_new[-(i - 1)]
  }
}
 | 
						|
 | 
						|
length(id_start2) - length(id_start_new)
 | 
						|
length(id_stop2) - length(id_stop_new)
 | 
						|
 | 
						|
ids <- data.frame(start = id_start_new, stop = id_stop_new)
 | 
						|
ids$diff <- ids$stop - ids$start
 | 
						|
 | 
						|
table(ids$diff)
 | 
						|
 | 
						|
# remove "Transform start" and "Transform stop" around other events
 | 
						|
 | 
						|
id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
 | 
						|
id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
 | 
						|
 | 
						|
# TODO: It still does not work correctly:
 | 
						|
dat[64764:64769,]
 | 
						|
#        time_ms           event artwork   popup       x       y     scale   rotation
 | 
						|
# 64764   473081 Transform start     052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
 | 
						|
# 64765   474226       Show Info     052 052.xml      NA      NA        NA         NA
 | 
						|
# 64766   475735 Transform start     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
 | 
						|
# 64767   475739  Transform stop     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
 | 
						|
# 64768   479326         Artwork     052 052.xml      NA      NA        NA         NA
 | 
						|
# 64769   479751  Transform stop     052 052.xml 1660.90 1883.20 0.8074586 29.0875534
 | 
						|
 | 
						|
# --> but no idea how to find these cases in an automated way...
 | 
						|
 | 
						|
# Drop the Transform rows that were not kept by the matching step above.
# A negative index built from an empty vector would drop *all* rows, so
# only subset when there is something to remove.
id_rm2 <- c(id_rm_start2, id_rm_stop2)
if (length(id_rm2) > 0) {
  dat <- dat[-id_rm2, ]
}
 | 
						|
# --> Every start ends with a stop now (but not necessarily the correct one!)
 | 
						|
 | 
						|
 | 
						|
dat1 <- dat[order(dat$date, dat$time_ms), ]
 | 
						|
dat1$time_diff <- c(NA, diff(dat1$time_ms))
 | 
						|
 | 
						|
boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ])
 | 
						|
 | 
						|
boxplot(time_ms ~ event, dat1)
 | 
						|
 | 
						|
 | 
						|
#' ## Plots
 | 
						|
 | 
						|
counts <- table(as.Date(dat$date), dat$event)
 | 
						|
lattice::barchart(counts, auto.key = TRUE)
 | 
						|
 | 
						|
 | 
						|
start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
 | 
						|
 | 
						|
counts <- table(as.Date(dat$date[dat$event %in% start_events]),
 | 
						|
                dat$event[dat$event %in% start_events])
 | 
						|
lattice::barchart(counts, auto.key = TRUE)
 | 
						|
 | 
						|
 | 
						|
# TODO: Do I want to "collapse" the data frame in a way, that I only have
 | 
						|
# one event for each "set", meaning
 | 
						|
#
 | 
						|
# * Transform start   + Transform stop     --> Transform
 | 
						|
# * Artwork/OpenCard  + Artwork/CloseCard  --> Show Subcard
 | 
						|
# * ShowPopup         + HidePopup          --> Show Popup
 | 
						|
# * Show Info         + Show Front         --> Flip Card
 | 
						|
# (see above ;))
 | 
						|
#
 | 
						|
# Then I would have meaningful variables like duration, distance, degree of
 | 
						|
# rotation, size of scaling, selection of Subcard etc.
 | 
						|
# This means that I would have to delete all "unclosed" events.
 | 
						|
 | 
						|
# Create a data frame with
 | 
						|
# case    event     attributes (can differ for different events)
 | 
						|
# ??
 | 
						|
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
 | 
						|
# unit??? Maybe look at differences between timestamps separately for
 | 
						|
# `artwork`? And identify "new observational unit" this way?
 | 
						|
#
 | 
						|
# Definition: (???)
 | 
						|
# 1. Touching a new `artwork` corresponds to "observational unit change"
 | 
						|
# 2. Time interval of XX min within one `artwork` on the same day
 | 
						|
#    corresponds to "observational unit change"
 | 
						|
 | 
						|
# id    activity    timestamp
 | 
						|
 | 
						|
# Split data frame in list of data frame which all correspond to one
 | 
						|
# artwork
 | 
						|
# dat_art <- split(dat, dat$artwork)
 | 
						|
 | 
						|
## --> Maybe need it at some point?
 | 
						|
 | 
						|
#' # Problems
 | 
						|
 | 
						|
#' * Opening and closing of events cannot be identified unambiguously; it
 | 
						|
#'   can happen that the wrong tags have been put together (e.g., Transform
 | 
						|
#'   start and Transform stop); therefore, durations etc. are only heuristic
 | 
						|
 |