#' ---
#' title: "HMC data cleaning"
#' author: "Nora Wickelmaier"
#' date: 2025-10-14
#' ---
|
|
|
|
# Workspace
# The workspace is deliberately NOT cleared with `rm(list = ls())` anymore:
# that call deletes every object of whoever sources this script, while still
# leaving attached packages and options untouched. Run the script in a fresh
# R session to get a clean state.

# Read anonymized data (one CSV file per wave)
# Cells containing "-99" or "NA" are read as missing values
na_codes <- c("-99", "NA")
dat1 <- read.csv("HMC_wave1_anonymized.csv", na.strings = na_codes)
dat2 <- read.csv("HMC_wave2_anonymized.csv", na.strings = na_codes)
dat3 <- read.csv("HMC_wave3_anonymized.csv", na.strings = na_codes)
dat4 <- read.csv("HMC_wave4_anonymized.csv", na.strings = na_codes)
dat5 <- read.csv("HMC_wave5_anonymized.csv", na.strings = na_codes)
dat6 <- read.csv("HMC_wave6_anonymized.csv", na.strings = na_codes)
|
|
|
|
# Check if subjects occur more than once within a wave
# (each call prints the IDs whose frequency in that wave is greater than 1)
which(table(dat1$subj_id) > 1)
which(table(dat2$subj_id) > 1)
which(table(dat3$subj_id) > 1)
which(table(dat4$subj_id) > 1)
which(table(dat5$subj_id) > 1)
which(table(dat6$subj_id) > 1)

# Inspect the rows belonging to duplicated IDs in waves 1 and 3.
# `%in%` is used instead of `==`: with more than one duplicated ID, `==`
# recycles the ID vector element-wise and misses rows, and rows with a
# missing subj_id would show up as all-NA rows.
dat1[dat1$subj_id %in% names(which(table(dat1$subj_id) > 1)),
     c("StartDate", "subj_id", "Duration_in_seconds")]
dat3[dat3$subj_id %in% names(which(table(dat3$subj_id) > 1)),
     c("StartDate", "subj_id", "Duration_in_seconds")]
|
|
|
|
# For every subj_id retain the first row only; later repeats are dropped
dat1 <- dat1[!duplicated(dat1$subj_id), , drop = FALSE]
dat3 <- dat3[!duplicated(dat3$subj_id), , drop = FALSE]
|
|
|
|
# Exclude the subject that first appears in wave 3. This should be impossible,
# because only wave 1 participants were invited to take part.
# Rows with a missing subj_id are dropped as well.
dat3 <- dat3[!is.na(dat3$subj_id) & dat3$subj_id != "subj1009", ,
             drop = FALSE]
|
|
|
|
# Wave 1: show the names of all variables that are missing in every row ...
names(dat1)[colSums(is.na(dat1)) == nrow(dat1)]
# ... and drop every column whose name starts with "Q1"
# NOTE(review): the pattern also matches names such as Q10 or Q11 -- confirm
# that all of these are meant to be removed
dat1 <- dat1[, !grepl("^Q1.*", names(dat1)), drop = FALSE]
|
|
|
|
# Per wave: collect and print the names of variables that are missing in
# every row although they are supposed to contain data
empty_var_w1 <- names(dat1)[colSums(is.na(dat1)) == nrow(dat1)]
print(empty_var_w1)

empty_var_w2 <- names(dat2)[colSums(is.na(dat2)) == nrow(dat2)]
print(empty_var_w2)

empty_var_w3 <- names(dat3)[colSums(is.na(dat3)) == nrow(dat3)]
print(empty_var_w3)

empty_var_w4 <- names(dat4)[colSums(is.na(dat4)) == nrow(dat4)]
print(empty_var_w4)

empty_var_w5 <- names(dat5)[colSums(is.na(dat5)) == nrow(dat5)]
print(empty_var_w5)

empty_var_w6 <- names(dat6)[colSums(is.na(dat6)) == nrow(dat6)]
print(empty_var_w6)
|
|
|
|
# Problems in wave 1
# Theory: the survey was altered *after* data collection and a new variable
# was created. The presentation of the data was correct (Angelica and Nora
# checked it via "view response" in Qualtrics).
#
# Consequence: the content of delg_tsk_typs_3 is discarded and the following
# variables each move down by one position:
#   delg_tsk_typs_4 --> delg_tsk_typs_3
#   delg_tsk_typs_5 --> delg_tsk_typs_4
#   delg_tsk_typs_6 --> delg_tsk_typs_5
#   delg_tsk_typs_7 --> delg_tsk_typs_6
#   delg_tsk_typs_8 --> delg_tsk_typs_7

# Shift all five columns in one step; the right-hand side is extracted before
# anything is overwritten, so every target receives the original content of
# its successor
dat1[paste0("delg_tsk_typs_", 3:7)] <- dat1[paste0("delg_tsk_typs_", 4:8)]

# The last column is now a leftover copy and is removed
dat1[["delg_tsk_typs_8"]] <- NULL
|
|
|
|
# fav_Icecream is a bot detection item --> should it be removed?
|
|
|
|
|
|
# Load the codebook; it is used below to select and rename the variables of
# every wave
cb <- openxlsx::read.xlsx(xlsxFile = "../HMC_codebook.xlsx")
|
|
|
|
# Select the variables of one wave and rename them according to the codebook.
#
# Arguments:
#   data      data frame of one wave, columns named as exported by Qualtrics
#   wave      name of the codebook column for this wave, e.g. "wave1"
#   codebook  data frame with the columns `qualtricsname`, `varname` and one
#             column per wave; defaults to the global object `cb`
#
# A codebook row is used when its entry in the wave column is "x" or missing.
#
# Returns a data frame containing only the selected columns, in codebook
# order, named after `varname`.
#
# Stops with an error if `wave` is not a column of the codebook (previously
# this silently returned a data frame without columns) or if selected
# variables are absent from `data`.
rename_data <- function(data, wave, codebook = cb) {
  stopifnot(is.character(wave), length(wave) == 1L)
  if (!wave %in% names(codebook)) {
    stop("Codebook has no column named '", wave, "'", call. = FALSE)
  }

  # Rows of the codebook that apply to this wave (mask is computed once)
  keep <- codebook[[wave]] == "x" | is.na(codebook[[wave]])
  old_names <- codebook$qualtricsname[keep]
  new_names <- codebook$varname[keep]

  absent <- setdiff(old_names, names(data))
  if (length(absent) > 0) {
    stop("Variables listed in the codebook for ", wave,
         " are missing in the data: ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # drop = FALSE keeps a data frame even if only one variable is selected
  data <- data[, old_names, drop = FALSE]
  names(data) <- new_names
  data
}
|
|
|
|
# Select and rename the variables of every wave according to the codebook
df1 <- dat1 |> rename_data("wave1")
df2 <- dat2 |> rename_data("wave2")
df3 <- dat3 |> rename_data("wave3")
df4 <- dat4 |> rename_data("wave4")
df5 <- dat5 |> rename_data("wave5")
df6 <- dat6 |> rename_data("wave6")
|
|
|
|
# identical(dat1$tsk_typ_if_wrtng_gnr_1, df1$tsk_typ_if_wrtng_1)
|
|
|
|
|
|
# Write the cleaned data of one wave to "HMC_<wave>_cleaned.csv".
#
# Arguments:
#   data  data frame to be saved
#   wave  label used in the file name, e.g. "wave1"
#   path  output directory, with or without a trailing slash
#
# The output directory is created if it does not exist yet. The file is
# written without row names. Called for its side effect.
save_dataframe <- function(data, wave, path = "../03_cleaned_data/") {
  # Strip trailing separators so that the file name does not end up behind a
  # double slash (the default path already ends in "/")
  out_dir <- sub("[/\\\\]+$", "", path)
  if (nzchar(out_dir) && !dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
  # as CSV
  write.csv(data,
            file.path(out_dir, paste0("HMC_", wave, "_cleaned.csv")),
            row.names = FALSE)
  # # as XLSX
  # openxlsx::write.xlsx(data,
  #   file.path(out_dir, paste0("HMC_", wave, "_cleaned.xlsx")),
  #   rowNames = FALSE)
  # # as SAV
  # haven::write_sav(data,
  #   file.path(out_dir, paste0("HMC_", wave, "_cleaned.sav")))
}
|
|
|
|
# Write the cleaned, anonymized data frame of each wave to disk
invisible(Map(
  save_dataframe,
  list(df1, df2, df3, df4, df5, df6),
  c("wave1", "wave2", "wave3", "wave4", "wave5", "wave6")
))
|
|
|