118 lines
3.9 KiB
R

#' ---
#' title: "HMC data cleaning"
#' author: "Nora Wickelmaier"
#' date: 2025-10-14
#' ---
# Read anonymized data ----
# NOTE: the workspace is intentionally not cleared with `rm(list = ls())`.
# That call only removes user-created objects (attached packages, options and
# the working directory survive), so it does not guarantee a clean state, and
# it wipes the workspace of anyone who source()s this script. Run the script
# in a fresh R session instead.
#
# Cells containing "-99" or "NA" are read as missing values.
na_codes <- c("-99", "NA")
dat1 <- read.csv("HMC_wave1_anonymized.csv", na.strings = na_codes)
dat2 <- read.csv("HMC_wave2_anonymized.csv", na.strings = na_codes)
dat3 <- read.csv("HMC_wave3_anonymized.csv", na.strings = na_codes)
dat4 <- read.csv("HMC_wave4_anonymized.csv", na.strings = na_codes)
dat5 <- read.csv("HMC_wave5_anonymized.csv", na.strings = na_codes)
dat6 <- read.csv("HMC_wave6_anonymized.csv", na.strings = na_codes)
# Check if subjects are double within waves ----
# Each call prints the subject IDs that occur more than once in that wave
# (the printed integers are their positions in the frequency table).
which(table(dat1$subj_id) > 1)
which(table(dat2$subj_id) > 1)
which(table(dat3$subj_id) > 1)
which(table(dat4$subj_id) > 1)
which(table(dat5$subj_id) > 1)
which(table(dat6$subj_id) > 1)

# Show the duplicated entries of wave 1 and wave 3.
# `%in%` is used instead of `==`: the right-hand side holds one ID per
# duplicated subject, so `==` would recycle it against the column as soon as
# more than one subject is duplicated (selecting the wrong rows), and it
# would return all-NA rows for missing subject IDs.
dat1[dat1$subj_id %in% names(which(table(dat1$subj_id) > 1)),
     c("StartDate", "subj_id", "Duration_in_seconds")]
dat3[dat3$subj_id %in% names(which(table(dat3$subj_id) > 1)),
     c("StartDate", "subj_id", "Duration_in_seconds")]

# Only keep first entry (first row per subject ID, in file order)
dat1 <- subset(dat1, !duplicated(dat1$subj_id))
dat3 <- subset(dat3, !duplicated(dat3$subj_id))
# Remove empty variables wave 1 ----
# Print the names of all wave 1 columns that contain nothing but NA ...
names(dat1)[colSums(is.na(dat1)) == nrow(dat1)]
# ... and drop every column whose name starts with "Q1".
dat1 <- dat1[, !grepl("^Q1.*", names(dat1)), drop = FALSE]

# Check for other variables that are empty, but are supposed to be filled.
# For each wave the names of the all-NA columns are stored and printed.
empty_var_w1 <- names(dat1)[colSums(is.na(dat1)) == nrow(dat1)]
print(empty_var_w1)
empty_var_w2 <- names(dat2)[colSums(is.na(dat2)) == nrow(dat2)]
print(empty_var_w2)
empty_var_w3 <- names(dat3)[colSums(is.na(dat3)) == nrow(dat3)]
print(empty_var_w3)
empty_var_w4 <- names(dat4)[colSums(is.na(dat4)) == nrow(dat4)]
print(empty_var_w4)
empty_var_w5 <- names(dat5)[colSums(is.na(dat5)) == nrow(dat5)]
print(empty_var_w5)
empty_var_w6 <- names(dat6)[colSums(is.na(dat6)) == nrow(dat6)]
print(empty_var_w6)
# Problems in wave 1 ----
# Theory: the survey was altered *after* data collection and a new variable
# was created. Data presentation was correct (Angelica and Nora checked it
# via "view response" in Qualtrics).
#
# Consequence for the exported columns:
#   delg_tsk_typs_3 needs to be deleted
#   delg_tsk_typs_4 --> delg_tsk_typs_3
#   delg_tsk_typs_5 --> delg_tsk_typs_4
#   delg_tsk_typs_6 --> delg_tsk_typs_5
#   delg_tsk_typs_7 --> delg_tsk_typs_6
#   delg_tsk_typs_8 --> delg_tsk_typs_7

# Shift the contents of items 4-8 down by one position in a single step
# (the right-hand side is evaluated before anything is overwritten) ...
dat1[paste0("delg_tsk_typs_", 3:7)] <- dat1[paste0("delg_tsk_typs_", 4:8)]
# ... and remove the now redundant last item.
dat1$delg_tsk_typs_8 <- NULL
# TODO: fav_Icecream is a bot detection item --> should it be removed?

# Read codebook and rename variables for all waves ----
# The codebook is stored one directory above the working directory.
cb <- openxlsx::read.xlsx(xlsxFile = "../HMC_codebook.xlsx")
#' Select and rename the columns of one wave according to the codebook
#'
#' Keeps the columns of `data` whose codebook entry in column `wave` is "x"
#' or missing, and replaces their Qualtrics names by the codebook's variable
#' names.
#'
#' @param data Data frame containing the Qualtrics-named columns selected
#'   for `wave`.
#' @param wave Name of the codebook column that flags the variables of this
#'   wave, e.g. "wave1".
#' @param codebook Data frame with the columns `qualtricsname`, `varname`
#'   and one flag column per wave. Defaults to the global object `cb`.
#' @return A data frame with the selected columns, named by `varname`.
rename_data <- function(data, wave, codebook = cb) {
  stopifnot(is.character(wave), length(wave) == 1L)
  # An unknown wave would otherwise yield an empty mask and silently return
  # a data frame without any columns.
  if (!wave %in% names(codebook)) {
    stop("Codebook has no column '", wave, "'.", call. = FALSE)
  }

  flag <- codebook[[wave]]
  keep <- flag == "x" | is.na(flag)
  old_names <- codebook$qualtricsname[keep]

  not_found <- setdiff(old_names, names(data))
  if (length(not_found) > 0) {
    stop("Columns listed in the codebook for '", wave,
         "' are missing from the data: ",
         paste(not_found, collapse = ", "), call. = FALSE)
  }

  # drop = FALSE keeps a data frame even if only one column is selected
  data <- data[, old_names, drop = FALSE]
  names(data) <- codebook$varname[keep]
  data
}
# Apply the codebook to every wave: datN (Qualtrics names) --> dfN (varnames)
df1 <- rename_data(data = dat1, wave = "wave1")
df2 <- rename_data(data = dat2, wave = "wave2")
df3 <- rename_data(data = dat3, wave = "wave3")
df4 <- rename_data(data = dat4, wave = "wave4")
df5 <- rename_data(data = dat5, wave = "wave5")
df6 <- rename_data(data = dat6, wave = "wave6")
# Manual sanity check (values must survive the renaming):
# identical(dat1$tsk_typ_if_wrtng_gnr_1, df1$tsk_typ_if_wrtng_1)
#' Write a data frame to "<path>/HMC_<wave>_cleaned.csv"
#'
#' @param data Data frame to be written.
#' @param wave Label inserted into the file name, e.g. "wave1".
#' @param path Output directory; trailing path separators are ignored.
#' @return The value of `write.csv()`; the function is called for its side
#'   effect of writing the file.
save_dataframe <- function(data, wave, path = "../03_cleaned_data/") {
  # Strip trailing separators so that a path ending in "/" (like the default)
  # does not produce a doubled separator ("...//HMC_...") in the file name.
  out_dir <- sub("[/\\\\]+$", "", path)
  out_stem <- file.path(out_dir, paste0("HMC_", wave, "_cleaned"))

  # as CSV
  write.csv(data, paste0(out_stem, ".csv"), row.names = FALSE)

  # # as XLSX
  # openxlsx::write.xlsx(data, paste0(out_stem, ".xlsx"), rowNames = FALSE)
  # # as SAV
  # haven::write_sav(data, paste0(out_stem, ".sav"))
}
# Save anonymized data frames ----
# Writes df1, ..., df6 (in this order) under the labels "wave1", ..., "wave6";
# invisible() suppresses printing of the list returned by Map().
invisible(Map(
  save_dataframe,
  list(df1, df2, df3, df4, df5, df6),
  paste0("wave", 1:6)
))