Initialize repository
This commit is contained in:
@@ -0,0 +1,117 @@
|
||||
#' ---
|
||||
#' title: "HMC data cleaning"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: 2025-10-14
|
||||
#' ---
|
||||
|
||||
# Start from an empty workspace
# NOTE(review): in an interactive session this also deletes objects that are
# unrelated to this script.
rm(list = ls())

# Read the anonymized raw data of the six waves; the strings "-99" and "NA"
# are both treated as missing values
read_wave <- function(file) {
  read.csv(file, na.strings = c("-99", "NA"))
}

dat1 <- read_wave("HMC_wave1_anonymized.csv")
dat2 <- read_wave("HMC_wave2_anonymized.csv")
dat3 <- read_wave("HMC_wave3_anonymized.csv")
dat4 <- read_wave("HMC_wave4_anonymized.csv")
dat5 <- read_wave("HMC_wave5_anonymized.csv")
dat6 <- read_wave("HMC_wave6_anonymized.csv")
|
||||
|
||||
# For each wave, show the subject IDs that occur in more than one row
# (the result is named by ID; the values are positions in the frequency table)
find_repeated_ids <- function(data) {
  id_counts <- table(data$subj_id)
  which(id_counts > 1)
}

find_repeated_ids(dat1)
find_repeated_ids(dat2)
find_repeated_ids(dat3)
find_repeated_ids(dat4)
find_repeated_ids(dat5)
find_repeated_ids(dat6)
|
||||
|
||||
# Show start date, ID, and duration of every row that belongs to a subject
# occurring more than once (waves 1 and 3).
# `%in%` is used instead of `==`: with more than one duplicated ID, `==` would
# recycle the ID vector and silently miss rows, and missing IDs would produce
# all-NA rows. `%in%` never returns NA and tests membership in the whole set.
dat1[dat1$subj_id %in% names(which(table(dat1$subj_id) > 1)),
     c("StartDate", "subj_id", "Duration_in_seconds")]
dat3[dat3$subj_id %in% names(which(table(dat3$subj_id) > 1)),
     c("StartDate", "subj_id", "Duration_in_seconds")]
|
||||
|
||||
# Repeated participations: per subject ID, retain only the row that comes
# first in the data
dat1 <- dat1[!duplicated(dat1$subj_id), , drop = FALSE]
dat3 <- dat3[!duplicated(dat3$subj_id), , drop = FALSE]
|
||||
|
||||
# Wave 1: print the names of the variables that contain only missing values
names(dat1)[colSums(is.na(dat1)) == nrow(dat1)]
# Drop every variable whose name starts with "Q1"
dat1 <- dat1[, !startsWith(names(dat1), "Q1"), drop = FALSE]
|
||||
|
||||
# For every wave, print and store the names of the variables that consist
# entirely of missing values although they are supposed to be filled
list_empty_vars <- function(data) {
  n_missing <- colSums(is.na(data))
  names(data)[n_missing == nrow(data)]
}

empty_var_w1 <- print(list_empty_vars(dat1))
empty_var_w2 <- print(list_empty_vars(dat2))
empty_var_w3 <- print(list_empty_vars(dat3))
empty_var_w4 <- print(list_empty_vars(dat4))
empty_var_w5 <- print(list_empty_vars(dat5))
empty_var_w6 <- print(list_empty_vars(dat6))
|
||||
|
||||
# Problems in wave 1
|
||||
# Theory: *After* data collection survey was altered and a new variable was
|
||||
# created. Data presentation was correct. (Angelica and Nora checked it via view
|
||||
# response in Qualtrics.)
|
||||
#
|
||||
# delg_tsk_typs_3 needs to be deleted
|
||||
# delg_tsk_typs_4 --> delg_tsk_typs_3
|
||||
# delg_tsk_typs_5 --> delg_tsk_typs_4
|
||||
# delg_tsk_typs_6 --> delg_tsk_typs_5
|
||||
# delg_tsk_typs_7 --> delg_tsk_typs_6
|
||||
# delg_tsk_typs_8 --> delg_tsk_typs_7
|
||||
|
||||
# Move the contents of delg_tsk_typs_4, ..., delg_tsk_typs_8 down by one
# position (overwriting delg_tsk_typs_3). Going through the items in
# ascending order guarantees that each source column is read before it is
# overwritten itself.
for (item in 3:7) {
  dat1[[paste0("delg_tsk_typs_", item)]] <-
    dat1[[paste0("delg_tsk_typs_", item + 1)]]
}

# The last column is now redundant
dat1[["delg_tsk_typs_8"]] <- NULL
|
||||
|
||||
# fav_Icecream is a bot detection item --> should it be removed?
|
||||
|
||||
|
||||
# Load the codebook; it is used below to select and rename the variables of
# all waves
cb <- openxlsx::read.xlsx(xlsxFile = "../HMC_codebook.xlsx")
|
||||
|
||||
#' Select and rename the variables of one wave according to the codebook
#'
#' Keeps the columns listed in `codebook$qualtricsname` whose entry in the
#' wave's marker column is `"x"` or `NA`, and renames them to the matching
#' entries of `codebook$varname`.
#'
#' @param data Data frame holding the data of one wave.
#' @param wave Name of the codebook column that marks the variables of this
#'   wave (e.g. `"wave1"`).
#' @param codebook Codebook data frame with the columns `qualtricsname`,
#'   `varname`, and the column named by `wave`. Defaults to the global `cb`.
#' @return `data` restricted to the selected columns and carrying the codebook
#'   variable names; always a data frame, even if only one column is selected.
rename_data <- function(data, wave, codebook = cb) {
  if (!wave %in% names(codebook)) {
    stop("Codebook has no column '", wave, "'.", call. = FALSE)
  }

  # Compute the row selection once so that old and new names stay aligned
  marker <- codebook[[wave]]
  keep <- is.na(marker) | marker == "x"
  old_names <- codebook$qualtricsname[keep]

  missing_vars <- setdiff(old_names, names(data))
  if (length(missing_vars) > 0) {
    stop(
      "Variables listed in the codebook for '", wave,
      "' are missing in the data: ",
      paste(missing_vars, collapse = ", "),
      call. = FALSE
    )
  }

  # drop = FALSE: a single selected column must not collapse to a vector,
  # otherwise names<- would label its elements instead of the column
  data <- data[, old_names, drop = FALSE]
  names(data) <- codebook$varname[keep]
  data
}
|
||||
|
||||
# Apply the codebook selection and renaming to each wave
df1 <- rename_data(data = dat1, wave = "wave1")
df2 <- rename_data(data = dat2, wave = "wave2")
df3 <- rename_data(data = dat3, wave = "wave3")
df4 <- rename_data(data = dat4, wave = "wave4")
df5 <- rename_data(data = dat5, wave = "wave5")
df6 <- rename_data(data = dat6, wave = "wave6")
|
||||
|
||||
# identical(dat1$tsk_typ_if_wrtng_gnr_1, df1$tsk_typ_if_wrtng_1)
|
||||
|
||||
|
||||
#' Write the cleaned data frame of one wave to disk as CSV
#'
#' The file is named `HMC_<wave>_cleaned.csv` and written without row names.
#'
#' @param data Data frame to save.
#' @param wave Wave label used in the file name (e.g. `"wave1"`).
#' @param path Output directory. Trailing path separators are ignored and the
#'   directory is created if it does not exist yet.
#' @return The path of the written CSV file, invisibly.
save_dataframe <- function(data, wave, path = "../03_cleaned_data/") {
  # Strip trailing separators so directory and file name are not joined by "//"
  out_dir <- sub("[/\\\\]+$", "", path)
  if (nzchar(out_dir) && !dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  # as CSV
  csv_file <- file.path(out_dir, paste0("HMC_", wave, "_cleaned.csv"))
  write.csv(data, csv_file, row.names = FALSE)
  # # as XLSX
  # openxlsx::write.xlsx(data,
  #   file.path(out_dir, paste0("HMC_", wave, "_cleaned.xlsx")),
  #   rowNames = FALSE)
  # # as SAV
  # haven::write_sav(data,
  #   file.path(out_dir, paste0("HMC_", wave, "_cleaned.sav")))

  invisible(csv_file)
}
|
||||
|
||||
# Write the renamed, anonymized data frame of each wave to disk
save_dataframe(data = df1, wave = "wave1")
save_dataframe(data = df2, wave = "wave2")
save_dataframe(data = df3, wave = "wave3")
save_dataframe(data = df4, wave = "wave4")
save_dataframe(data = df5, wave = "wave5")
save_dataframe(data = df6, wave = "wave6")
|
||||
|
||||
Reference in New Issue
Block a user