#' ---
#' title: "HMC data cleaning"
#' author: "Nora Wickelmaier"
#' date: 2025-10-14
#' ---

# Clear workspace
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; it does not detach packages or reset options. Running the
# script in a fresh R session is the more reliable way to get a clean state.
rm(list = ls())

# Helpers ----------------------------------------------------------------

# Read the anonymized CSV export of one wave. "-99" and "NA" are both
# treated as missing values.
read_wave <- function(wave) {
  read.csv(paste0("HMC_wave", wave, "_anonymized.csv"),
           na.strings = c("-99", "NA"))
}

# Show start date, subject ID and duration for every row whose subj_id occurs
# more than once in `data`. `%in%` (rather than `==`) is used so that several
# duplicated IDs are matched correctly and NA IDs do not produce NA rows.
show_duplicates <- function(data) {
  id_counts <- table(data$subj_id)
  dup_ids <- names(id_counts)[id_counts > 1]
  data[data$subj_id %in% dup_ids,
       c("StartDate", "subj_id", "Duration_in_seconds")]
}

# Names of all columns of `data` that contain nothing but NA.
empty_columns <- function(data) {
  all_missing <- vapply(data, function(x) all(is.na(x)), logical(1))
  names(data)[all_missing]
}

# Read anonymized data ---------------------------------------------------

dat1 <- read_wave(1)
dat2 <- read_wave(2)
dat3 <- read_wave(3)
dat4 <- read_wave(4)
dat5 <- read_wave(5)
dat6 <- read_wave(6)

# Check if subjects are double within waves ------------------------------

which(table(dat1$subj_id) > 1)
which(table(dat2$subj_id) > 1)
which(table(dat3$subj_id) > 1)
which(table(dat4$subj_id) > 1)
which(table(dat5$subj_id) > 1)
which(table(dat6$subj_id) > 1)

# Inspect the duplicated entries in the two waves that are de-duplicated below
show_duplicates(dat1)
show_duplicates(dat3)

# Only keep first entry
dat1 <- subset(dat1, !duplicated(dat1$subj_id))
dat3 <- subset(dat3, !duplicated(dat3$subj_id))

# Remove empty variables wave 1 ------------------------------------------

empty_columns(dat1)

# Drops every wave-1 variable whose name starts with "Q1"
dat1 <- subset(dat1, select = !grepl("^Q1.*", names(dat1)))

# Check for other variables that are empty, but are supposed to be filled
empty_var_w1 <- empty_columns(dat1) |> print()
empty_var_w2 <- empty_columns(dat2) |> print()
empty_var_w3 <- empty_columns(dat3) |> print()
empty_var_w4 <- empty_columns(dat4) |> print()
empty_var_w5 <- empty_columns(dat5) |> print()
empty_var_w6 <- empty_columns(dat6) |> print()

# Problems in wave 1 -----------------------------------------------------
# Theory: *After* data collection survey was altered and a new variable was
# created. Data presentation was correct. (Angelica and Nora checked it via
# view response in Qualtrics.)
#
# delg_tsk_typs_3 needs to be deleted
# delg_tsk_typs_4 --> delg_tsk_typs_3
# delg_tsk_typs_5 --> delg_tsk_typs_4
# delg_tsk_typs_6 --> delg_tsk_typs_5
# delg_tsk_typs_7 --> delg_tsk_typs_6
# delg_tsk_typs_8 --> delg_tsk_typs_7

# Guard: assigning a missing column (NULL) would silently delete the target
# column instead of shifting it, so make sure all source columns exist.
stopifnot(all(paste0("delg_tsk_typs_", 4:8) %in% names(dat1)))

# Rename variables as above (order matters: each column is overwritten by its
# successor before the successor itself is overwritten)
dat1$delg_tsk_typs_3 <- dat1$delg_tsk_typs_4
dat1$delg_tsk_typs_4 <- dat1$delg_tsk_typs_5
dat1$delg_tsk_typs_5 <- dat1$delg_tsk_typs_6
dat1$delg_tsk_typs_6 <- dat1$delg_tsk_typs_7
dat1$delg_tsk_typs_7 <- dat1$delg_tsk_typs_8
dat1$delg_tsk_typs_8 <- NULL

# fav_Icecream is a bot detection item --> should it be removed?

# Read codebook and rename variables for all waves -----------------------

cb <- openxlsx::read.xlsx("../HMC_codebook.xlsx")

#' Select and rename the variables of one wave according to the codebook
#'
#' Keeps the columns of `data` listed in `codebook$qualtricsname` for all
#' codebook rows whose `wave` entry is "x" or NA, and renames them to the
#' corresponding `codebook$varname`.
#' NOTE(review): presumably an NA entry marks variables present in every
#' wave -- confirm against the codebook.
#'
#' @param data Data frame of one wave, with Qualtrics variable names.
#' @param wave Name of the codebook column for this wave, e.g. "wave1".
#' @param codebook Codebook data frame with columns `qualtricsname`,
#'   `varname` and one column per wave. Defaults to the global `cb`.
#' @return Data frame containing only the selected columns, renamed.
rename_data <- function(data, wave, codebook = cb) {
  # A misspelled wave would otherwise silently select zero columns
  stopifnot(wave %in% names(codebook))
  in_wave <- codebook[[wave]] == "x" | is.na(codebook[[wave]])
  # drop = FALSE keeps a data frame even if only one column is selected
  data <- data[, codebook$qualtricsname[in_wave], drop = FALSE]
  names(data) <- codebook$varname[in_wave]
  data
}

df1 <- rename_data(dat1, "wave1")
df2 <- rename_data(dat2, "wave2")
df3 <- rename_data(dat3, "wave3")
df4 <- rename_data(dat4, "wave4")
df5 <- rename_data(dat5, "wave5")
df6 <- rename_data(dat6, "wave6")

# identical(dat1$tsk_typ_if_wrtng_gnr_1, df1$tsk_typ_if_wrtng_1)

#' Write a cleaned data frame to "<path>/HMC_<wave>_cleaned.csv"
#'
#' @param data Data frame to save.
#' @param wave Wave label used in the file name, e.g. "wave1".
#' @param path Output directory; trailing slashes are ignored.
#' @return The value of `write.csv()`; called for its side effect.
save_dataframe <- function(data, wave, path = "../03_cleaned_data/") {
  # Strip trailing slashes so the default path does not yield "dir//file"
  out_dir <- sub("/+$", "", path)
  # as CSV
  write.csv(data,
            file.path(out_dir, paste0("HMC_", wave, "_cleaned.csv")),
            row.names = FALSE)
  # # as XLSX
  # openxlsx::write.xlsx(data,
  #                      paste0(path, "/HMC_", wave, "_cleaned.xlsx"),
  #                      rowNames = FALSE)
  # # as SAV
  # haven::write_sav(data,
  #                  paste0(path, "/HMC_", wave, "_cleaned.sav"))
}

# Save cleaned data frames
save_dataframe(df1, "wave1")
save_dataframe(df2, "wave2")
save_dataframe(df3, "wave3")
save_dataframe(df4, "wave4")
save_dataframe(df5, "wave5")
save_dataframe(df6, "wave6")