Started seriously working on data preprocessing; very intermediate version though
This commit is contained in:
parent
45bf0d9af9
commit
e88981e3b9
@ -37,6 +37,7 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE)
|
||||
#' Choose which folders with raw log files should be included:
|
||||
|
||||
folders <- "all"
|
||||
#folders <- "_2016b"
|
||||
|
||||
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
|
||||
|
||||
@ -129,6 +130,10 @@ str(dat)
|
||||
|
||||
head(dat[, 2:ncol(dat)], 20)
|
||||
|
||||
# sort by date, since sorting by file names does not make sense because of
|
||||
# missing left zero padding
|
||||
dat <- dat[order(dat$date), ]
|
||||
|
||||
## TODO: Replace artwork and popup numbers with informative strings
|
||||
|
||||
write.table(dat, "../data/rawdata_logfiles.csv",
|
||||
|
224
code/01b_investigate.R
Normal file
224
code/01b_investigate.R
Normal file
@ -0,0 +1,224 @@
|
||||
#' ---
|
||||
#' title: "Preprocessing log files"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: "`r Sys.Date()`"
|
||||
#' output:
|
||||
#' html_document:
|
||||
#' toc: true
|
||||
#' toc_float: true
|
||||
#' pdf_document:
|
||||
#' toc: true
|
||||
#' number_sections: true
|
||||
#' geometry: margin = 2.5cm
|
||||
#' ---
|
||||
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
# LogEntry classes:
|
||||
# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
|
||||
# TRANSFORM_STOP: "Transform stop"
|
||||
# START_APPLICATION: "Start Application"
|
||||
# SHOW_APPLICATION: "Show Application"
|
||||
# SHOW_INFO: "Show Info" --> "Flip Card" in Tool
|
||||
# SHOW_FRONT: "Show Front"
|
||||
# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
|
||||
# HIDE_POPUP: "HidePopup"
|
||||
# ARTWORK: "Artwork" --> "Show Topic" in Tool
|
||||
|
||||
#' # Read data
|
||||
|
||||
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
|
||||
dat0$date <- as.POSIXct(dat0$date) # create date object
|
||||
|
||||
plot(dat0$time_ms[1:3000], type = "l")
|
||||
|
||||
# what happens here? Why does `time_ms` go down, but not to 0?
|
||||
plot(dat0$time_ms[2500:3000], type = "l")
|
||||
plot(dat0$time_ms[2755:2765], type = "l") # "zoom in"
|
||||
dat0[2755:2765, ]
|
||||
# --> overall time stamp keeps going up...
|
||||
|
||||
# TODO: How to create a plot that gives the same information based on
|
||||
# `time_ms` und `date`??
|
||||
plot(time_ms ~ date, dat0[1:5000, ], type = "b")
|
||||
abline(h = 0, col = "red", lty = 3)
|
||||
# Visualize night
|
||||
plot(time_ms ~ date, dat0[1:10000, ], type = "b")
|
||||
|
||||
|
||||
# Not all `Start Application` have `time_ms = 0` - why??
|
||||
|
||||
dat0[125537:125542, ]
|
||||
dat0[6673501:6673510, ]
|
||||
# --> What's happening here?
|
||||
|
||||
table(dat0[dat0$event %in% "Start Application", c("event", "date", "time_ms")]$time_ms)
|
||||
# 0 1 15 16 296 2819 2914 3191 5316 6535
|
||||
# 3131 4 21 48 1 1 1 1 1 1
|
||||
# --> ???
|
||||
dat0[dat0$event == "Start Application" & dat0$time_ms == 6535, ]
|
||||
dat0[989313:989317, ]
|
||||
|
||||
dat0[dat0$event == "Start Application" & dat0$time_ms == 5316, ]
|
||||
dat0[2071078:2071082, ]
|
||||
|
||||
dat0[dat0$event == "Start Application" & dat0$time_ms == 3191, ]
|
||||
dat0[2851863:2851867, ]
|
||||
|
||||
dat0[dat0$event == "Start Application" & dat0$time_ms == 16, ]
|
||||
dat0[156382:156386, ]
|
||||
dat0[5566940:5566947, ]
|
||||
# --> pattern is *not* consistent
|
||||
|
||||
dat0[dat0$event == "Start Application" & dat0$time_ms == 1, ]
|
||||
dat0[125537:125542, ]
|
||||
|
||||
|
||||
xtabs( ~ event + as.Date(date), dat0[1:1000, ])
|
||||
|
||||
# How many days do we have with up to 8 "Start Applications"
|
||||
table(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ]))
|
||||
# 1 2 3 4 5 6 7 8
|
||||
# 381 657 272 86 37 14 10 2
|
||||
# --> 8 days without any "Start Application"
|
||||
length(unique(as.Date(dat0$date))) -
|
||||
length(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ]))
|
||||
|
||||
# But only 6 files with 2 "Start Applications"
|
||||
table(xtabs( ~ event + fileid, dat0[dat0$event == "Start Application", ]))
|
||||
# 1 2
|
||||
# 3198 6
|
||||
# --> That means we have 36,563 file ids without any "Start Application"
|
||||
|
||||
|
||||
#' # Remove irrelevant events
|
||||
|
||||
#' ## Remove Start Application and Show Application
|
||||
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
|
||||
|
||||
#' ## Remove "button presses"
|
||||
|
||||
# Sort data frame by artwork and date
|
||||
dat <- dat[order(dat$artwork, dat$date), ]
|
||||
|
||||
# remove "Transform start" and "Transform stop" following directly each
|
||||
# other, since I do not know how to interpret them as events
|
||||
id_start <- which(dat$event == "Transform start")
|
||||
id_stop <- which(dat$event == "Transform stop")
|
||||
|
||||
id_rm_start <- id_start[diff(id_start) == 1]
|
||||
id_rm_stop <- id_stop[diff(id_stop) == 1]
|
||||
|
||||
dat <- dat[-c(id_rm_start, id_rm_stop), ]
|
||||
rownames(dat) <- NULL
|
||||
|
||||
|
||||
id_start2 <- which(dat$event == "Transform start")
|
||||
id_stop2 <- which(dat$event == "Transform stop")
|
||||
|
||||
length(id_start2) - length(id_stop2)
|
||||
# 340 --> "starts too many"
|
||||
|
||||
# remove "Transform start" and "Transform stop" following directly each
|
||||
# other (but with events in between!)
|
||||
id_start_new <- id_start2
|
||||
id_stop_new <- id_stop2
|
||||
|
||||
for (i in 2:length(id_start_new)) {
|
||||
if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) {
|
||||
id_start_new <- id_start_new[-(i-1)]
|
||||
} else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) {
|
||||
id_stop_new <- id_stop_new[-(i-1)]
|
||||
}
|
||||
}
|
||||
|
||||
length(id_start2) - length(id_start_new)
|
||||
length(id_stop2) - length(id_stop_new)
|
||||
|
||||
ids <- data.frame(start = id_start_new, stop = id_stop_new)
|
||||
ids$diff <- ids$stop - ids$start
|
||||
|
||||
table(ids$diff)
|
||||
|
||||
# remove "Transform start" and "Transform stop" around other events
|
||||
|
||||
id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
|
||||
id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
|
||||
|
||||
# TODO: It still does not work correctly:
|
||||
dat[64764:64769,]
|
||||
# time_ms event artwork popup x y scale rotation
|
||||
# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
|
||||
# 64765 474226 Show Info 052 052.xml NA NA NA NA
|
||||
# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
|
||||
# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
|
||||
# 64768 479326 Artwork 052 052.xml NA NA NA NA
|
||||
# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534
|
||||
|
||||
# --> but no idea how to find these cases in an automated way...
|
||||
|
||||
dat <- dat[-c(id_rm_start2, id_rm_stop2), ]
|
||||
# --> Every start ends with a stop now (but not necessarily the correct one!)
|
||||
|
||||
|
||||
dat1 <- dat[order(dat$date, dat$time_ms), ]
|
||||
dat1$time_diff <- c(NA, diff(dat1$time_ms))
|
||||
|
||||
boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ])
|
||||
|
||||
boxplot(time_ms ~ event, dat1)
|
||||
|
||||
|
||||
#' ## Plots
|
||||
|
||||
counts <- table(as.Date(dat$date), dat$event)
|
||||
lattice::barchart(counts, auto.key = TRUE)
|
||||
|
||||
|
||||
start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
|
||||
|
||||
counts <- table(as.Date(dat$date[dat$event %in% start_events]),
|
||||
dat$event[dat$event %in% start_events])
|
||||
lattice::barchart(counts, auto.key = TRUE)
|
||||
|
||||
|
||||
# TODO: Do I want to "collapse" the data frame in a way, that I only have
|
||||
# one event for each "set", meaning
|
||||
#
|
||||
# * Transform start + Transform stop --> Transform
|
||||
# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
|
||||
# * ShowPopup + HidePopup --> Show Popup
|
||||
# * Show Info + Show Front --> Flip Card
|
||||
# (s.o. ;))
|
||||
#
|
||||
# Then I would have meaningful variables like duration, distance, degree of
|
||||
# rotation, size of scaling, selection of Subcard etc.
|
||||
# This means that I would have to delete all "unclosed" events.
|
||||
|
||||
# Create a data frame with
|
||||
# case event attributes (can differ for different events)
|
||||
# ??
|
||||
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
|
||||
# unit??? Maybe look at differences between timestamps separately for
|
||||
# `artwork`? And identify "new observational unit" this way?
|
||||
#
|
||||
# Definition: (???)
|
||||
# 1. Touching a new `artwork` corresponds to "observational unit change"
|
||||
# 2. Time interval of XX min within one `artwork` on the same day
|
||||
# corresponds to "observational unit change"
|
||||
|
||||
# id activity timestamp
|
||||
|
||||
# Split data frame in list of data frame which all correspond to one
|
||||
# artwork
|
||||
# dat_art <- split(dat, dat$artwork)
|
||||
|
||||
## --> Maybe need it at some point?
|
||||
|
||||
#' # Problems
|
||||
|
||||
#' * Opening and closing of events cannot be identified unambiguously; it
|
||||
#' can happen that the wrong tags have been put together (e.g., Transform
|
||||
#' start and Transform stop); therefore, durations etc. are only heuristic
|
||||
|
@ -2,7 +2,7 @@
|
||||
#' title: "Preprocessing log files"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: "`r Sys.Date()`"
|
||||
#' output:
|
||||
#' output:
|
||||
#' html_document:
|
||||
#' toc: true
|
||||
#' toc_float: true
|
||||
@ -27,15 +27,140 @@
|
||||
|
||||
#' # Read data
|
||||
|
||||
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
|
||||
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
||||
header = TRUE)
|
||||
dat0$date <- as.POSIXct(dat0$date) # create date object
|
||||
|
||||
# TODO: Add a case identifier based on timestamps -- needs to be done on
|
||||
# "raw data". Is it possible? Something seems seriously wrong with
|
||||
# `time_ms`
|
||||
|
||||
#' # Remove irrelevant events
|
||||
|
||||
#' ## Remove Start Application and Show Application
|
||||
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
|
||||
dat$logs <- NULL # do not need original log files
|
||||
dat$date <- as.POSIXct(dat$date) # create date object
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||
"Show Application")))
|
||||
|
||||
#' # Close events
|
||||
|
||||
#' Do it for Tranform events first
|
||||
tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
|
||||
tmp <- tmp[order(tmp$artwork, tmp$date), ]
|
||||
rownames(tmp) <- NULL
|
||||
|
||||
# Find out how often "Transform start" follows each other
|
||||
num_start <- diff(c(0, which(tmp$event == "Transform stop")))
|
||||
tmp$eventid <- rep(seq_along(num_start), num_start)
|
||||
head(tmp[, c("event", "eventid")], 25)
|
||||
|
||||
table(table(tmp$eventid))
|
||||
# 1 2 3 4 5 6 7 8 10 11
|
||||
# 73 78429 5156 842 222 66 18 14 3 1
|
||||
# --> compare to table(num_start)!
|
||||
|
||||
# Find out how often "Transform stop" follows each other
|
||||
num_stop <- c(diff(c(0, which(tmp$event == "Transform start"))))
|
||||
table(num_stop)
|
||||
|
||||
tmp$eventrep <- rep(num_start, num_start)
|
||||
tmp$dupl <- duplicated(tmp[, c("event", "eventid")]) # keep first
|
||||
tmp$dupl <- duplicated(tmp[, c("event", "eventid")], fromLast = TRUE) # keep last
|
||||
tmp[tmp$eventrep == 10, ]
|
||||
|
||||
tmp$dupl <- NULL
|
||||
tmp$eventrep <- NULL
|
||||
|
||||
|
||||
# remove duplicated "Transform start" events
|
||||
tmp <- tmp[!duplicated(tmp[, c("event", "eventid")]), ]
|
||||
|
||||
# remove duplicated "Transform stop" events
|
||||
id_stop <- which(tmp$event == "Transform stop")
|
||||
id_rm_stop <- id_stop[diff(id_stop) == 1]
|
||||
|
||||
tmp <- tmp[-(id_rm_stop + 1), ]
|
||||
|
||||
# transform to wide data format
|
||||
tmp$event <- ifelse(tmp$event == "Transform start", "start", "stop")
|
||||
|
||||
trans_wide <- reshape(tmp, direction = "wide",
|
||||
idvar = c("eventid", "artwork"),
|
||||
timevar = "event", drop = c("fileid", "popup", "card")
|
||||
)
|
||||
|
||||
rownames(trans_wide) <- NULL
|
||||
# --> when fileid is part of the reshape, it does not work correctly, since
|
||||
# we sometimes have a start - stop event that is recorded in two separate
|
||||
# log files
|
||||
# TODO: This runs for quite some time. Is this more efficient with dplyr?
|
||||
|
||||
# which(is.na(trans_wide$date.start))
|
||||
|
||||
trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start
|
||||
trans_wide$distance <- apply(trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")],
|
||||
1, function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
|
||||
trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start
|
||||
trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start
|
||||
|
||||
trans_wide <- trans_wide[trans_wide$distance != 0 &
|
||||
trans_wide$rotationDegree != 0 &
|
||||
trans_wide$scaleSize != 0, ]
|
||||
# removes almost 2/3 of the data (for small data set)
|
||||
|
||||
|
||||
# TODO: How do I handle popups from glossar???
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Should every "Show front" be the beginning of a new trace?
|
||||
# Should Transform events be handled separately and then be "added" again
|
||||
# by timestamp?
|
||||
|
||||
########
|
||||
tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
|
||||
rownames(tmp) <- NULL
|
||||
|
||||
tmp$trace <- NA
|
||||
last_event <- tmp$event[1]
|
||||
|
||||
for (art in unique(tmp$artwork)) {
|
||||
|
||||
for (i in 1:nrow(tmp)) {
|
||||
|
||||
if (last_event == "Show Info" & (tmp$artwork[i] == art |
|
||||
tmp$artwork[i] == "glossar")) {
|
||||
tmp$trace[i] <- "start"
|
||||
} else if (last_event == "Show Front" & (tmp$artwork[i] == art |
|
||||
tmp$artwork[i] == "glossar")) {
|
||||
tmp$trace[i] <- "stop"
|
||||
}
|
||||
last_event <- tmp$event[i + 1]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
head(tmp[4:ncol(tmp)], 50)
|
||||
# TODO: Great job! You used a for-loop to rename "Show info" and "Show
|
||||
# front" to "start" and "stop" ;)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#' ## Remove "button presses"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user