184 lines
6.9 KiB
R
184 lines
6.9 KiB
R
#' ---
#' title: "HMC anonymization"
#' author: "Nora Wickelmaier"
#' date: 2025-10-14
#' ---
|
|
|
|
# Workspace handling
# Run this script in a fresh R session (e.g. via `Rscript` or "Restart R")
# instead of clearing the workspace from inside the script: `rm(list = ls())`
# only deletes global objects (attached packages and options survive), and it
# would silently wipe the workspace of anyone who sources this file.
# Everything used below is defined in this script, so no clean-up is needed.
|
|
|
|
#------------------------------------------------------------------------------
|
|
# Helper functions
|
|
# Anonymize the raw Qualtrics export of one wave.
#
# Steps:
#   1. Keep only rows that pass the status / consent / honesty filters.
#   2. Check and clean the participant IDs (mismatch with PROLIFIC_PID is an
#      error; duplicates, "-99" and NA only raise warnings).
#   3. Replace participant IDs by anonymous IDs ("subj0001", ...). The mapping
#      is written to "id_mapping_key_<wave>.csv" in the working directory so
#      that the same participant keeps the same anonymous ID across waves.
#   4. Drop the original ID columns and Qualtrics metadata columns.
#   5. Clean up column names and add the columns `sample` and `wave`.
#
# Arguments:
#   data    data frame with the raw export; must contain the columns used
#           below (Status, informed_consent, truth_answer_check,
#           consent_use_of_data, participant_ID, PROLIFIC_PID, ...).
#   wave    character scalar of the form "wave<N>", e.g. "wave3".
#   sample  either "sample1" (default) or "sample2".
#
# Value: the anonymized data frame.
#
# Side effects: prints the number of retained rows and writes the key file for
# this wave. For waves after wave 1 an existing key file is read: normally the
# one of the previous wave; for a sample other than "sample1" the key file of
# the current wave is used if it exists, so call sample 1 of a wave before
# sample 2 of the same wave.
anonymize_wave <- function(data, wave, sample = "sample1") {

  # Validate first so that no key file is written for an unusable call
  if (length(sample) != 1 || !sample %in% c("sample1", "sample2")) {
    stop("Cannot understand what you put as sample.")
  }

  df <- data |>
    # drop rows with Status == 1 (the original comment refers to
    # "Survey Preview"; presumably that is what 1 codes -- TODO confirm)
    dplyr::filter(Status != 1) |>
    #dplyr::filter(Finished != 0) |>             # drop unfinished surveys
    dplyr::filter(informed_consent == 1) |>      # keep: gave consent
    dplyr::filter(truth_answer_check == 1) |>    # keep: answered honestly
    dplyr::filter(consent_use_of_data == 1)      # keep: consented to data use

  cat("\nThe data for ", wave, " have ", nrow(df), " rows.\n\n")

  if (!all(df$participant_ID == df$PROLIFIC_PID, na.rm = TRUE)) {
    stop("Something is wrong with the IDs. Please check and rerun the script.")
  }

  if (length(df$participant_ID) > length(unique(df$participant_ID))) {
    warning("In ", wave, " there are observations with identical participant IDs. Make sure to remove them!")
  }

  # "-99" and NA both mean "no participant ID"; such rows cannot be linked
  # across waves and are dropped
  if (any(df$participant_ID %in% "-99")) {
    warning("In ", wave, " there is at least one participant without participant ID. Since these cannot be integrated with the other waves, they were removed from the data set.")
    df <- df[!(df$participant_ID %in% "-99"), ]
  }

  if (any(is.na(df$participant_ID))) {
    warning("In ", wave, " there is at least one participant without participant ID. Since these cannot be integrated with the other waves, they were removed from the data set.")
    df <- df[!is.na(df$participant_ID), ]
  }

  key_file <- paste0("id_mapping_key_", wave, ".csv")
  ids <- unique(df$participant_ID)

  if (wave == "wave1") {
    # One row per participant: duplicated IDs in the data must not end up as
    # duplicated rows in the key, because the keys are used as factor levels
    # in later waves and factor() rejects duplicated levels
    keys_new <- data.frame(
      original_id = ids,
      subj_id = sprintf("subj%04d", seq_along(ids)),
      stringsAsFactors = FALSE
    )
  } else {
    wave_no <- as.numeric(sub("^wave", "", wave))
    prev_key_file <- paste0("id_mapping_key_wave", wave_no - 1, ".csv")

    # A second sample of the same wave has to build on the key file that the
    # first sample just wrote. Starting from the previous wave again could
    # assign the same anonymous IDs to different participants and would
    # overwrite the IDs added for sample 1.
    if (sample != "sample1" && file.exists(key_file)) {
      source_file <- key_file
    } else {
      source_file <- prev_key_file
    }

    # Read IDs as text so that no ID is converted to a number
    keys <- read.csv(source_file, colClasses = "character")
    # Tolerate key files that contain repeated rows
    keys <- keys[!duplicated(keys$original_id), ]

    # Participants that do not have an anonymous ID yet
    missing_IDs <- ids[!ids %in% keys$original_id]
    if (length(missing_IDs) > 0) {
      # Continue after the highest number used so far (not just the last row,
      # and without assuming exactly four digits)
      start <- if (nrow(keys) > 0) {
        max(as.numeric(sub("^subj", "", keys$subj_id)))
      } else {
        0
      }
      new_IDs <- sprintf("subj%04d",
                         as.integer(start + seq_along(missing_IDs)))
      keys_new <- rbind(keys,
                        data.frame(original_id = missing_IDs,
                                   subj_id = new_IDs,
                                   stringsAsFactors = FALSE))
    } else {
      keys_new <- keys
    }
  }

  # Mapping file for the following waves
  write.csv(keys_new, key_file, row.names = FALSE)

  # Create the anonymous ID variable
  df$subj_id <- factor(df$participant_ID,
                       levels = keys_new$original_id,
                       labels = keys_new$subj_id)

  # Remove old ID variables
  df_anonym <- df |>
    dplyr::select(-participant_ID, -PROLIFIC_PID)

  # Remove irrelevant Qualtrics variables (either empty or uninformative)
  df_anonym <- df_anonym |>
    dplyr::select(-IPAddress, -ResponseId, -RecipientLastName,
                  -RecipientFirstName, -RecipientEmail, -ExternalReference,
                  -LocationLatitude, -LocationLongitude, -DistributionChannel,
                  -UserLanguage, -RecordedDate, -Progress)

  # Timing variables are removed for the waves listed here
  if (wave %in% paste0("wave", 1:4)) {
    df_anonym <- df_anonym |>
      dplyr::select(-`time_recording1_First Click`,
                    -`time_recording1_Last Click`,
                    -`time_recording1_Page Submit`,
                    -`time_recording1_Click Count`)
  }

  if (wave %in% paste0("wave", 3:6)) {
    df_anonym <- df_anonym |>
      dplyr::select(-`Q166_First Click`, -`Q166_Last Click`,
                    -`Q166_Page Submit`, -`Q166_Click Count`)
  }

  # Rename variables that might cause trouble: drop parentheses and replace
  # blanks, hyphens and "#" by underscores
  clean_names <- gsub("[()]", "", names(df_anonym))
  clean_names <- gsub("[ #-]", "_", clean_names)
  names(df_anonym) <- clean_names

  # Add which sample and which wave it is to data frame
  df_anonym$sample <- sample
  df_anonym$wave <- wave

  # Return anonymized data frame
  df_anonym
}
|
|
|
|
# Write an anonymized data frame to "<path>/HMC_<wave>_anonymized.csv".
#
# Arguments:
#   data  data frame to be written.
#   wave  character scalar used in the file name, e.g. "wave1".
#   path  output directory, with or without a trailing separator. The
#         directory has to exist already.
#
# Value: `data`, invisibly.
save_dataframe <- function(data, wave, path = "../02_anonymized_data/") {
  # Strip trailing separators so that the default path does not produce a
  # doubled "//" in the file name
  out_dir <- sub("[/\\\\]+$", "", path)
  out_base <- file.path(out_dir, paste0("HMC_", wave, "_anonymized"))

  # as CSV
  write.csv(data, paste0(out_base, ".csv"), row.names = FALSE)

  # # as XLSX
  # openxlsx::write.xlsx(data, paste0(out_base, ".xlsx"), rowNames = FALSE)
  # # as SAV
  # haven::write_sav(data, paste0(out_base, ".sav"))

  invisible(data)
}
|
|
#------------------------------------------------------------------------------
|
|
|
|
# Import the raw Qualtrics exports (waves 4 to 6 come with a second sample)
df_wave1 <- qualtRics::read_survey(
  "wave1/AI+Trends_Wave1_August+4,+2025_10.03.csv"
)
df_wave2 <- qualtRics::read_survey(
  "wave2/AI+Trends_Wave2_August+4,+2025_10.07.csv"
)
df_wave3 <- qualtRics::read_survey(
  "wave3/AI+Trends_Wave3_August+4,+2025_10.07.csv"
)
df_wave4 <- qualtRics::read_survey(
  "wave4/AI+Trends_Wave4_August+4,+2025_10.08.csv"
)
df_wave4_sample2 <- qualtRics::read_survey(
  "wave4/AI+Trends_Wave4_sample2_August+4,+2025_10.09.csv"
)
df_wave5 <- qualtRics::read_survey(
  "wave5/AI+Trends_Wave5_August+4,+2025_10.10.csv"
)
df_wave5_sample2 <- qualtRics::read_survey(
  "wave5/AI+Trends_Wave5_sample2_August+4,+2025_10.11.csv"
)
df_wave6 <- qualtRics::read_survey(
  "wave6/AI+Trends_Wave6_August+4,+2025_10.11.csv"
)
df_wave6_sample2 <- qualtRics::read_survey(
  "wave6/AI+Trends_Wave6_sample2_August+4,+2025_10.12.csv"
)

# Anonymize wave by wave. The order matters: every call reads the ID key
# file written by an earlier call and writes the key file for its own wave.
dat1 <- anonymize_wave(df_wave1, wave = "wave1")
dat2 <- anonymize_wave(df_wave2, wave = "wave2")
dat3 <- anonymize_wave(df_wave3, wave = "wave3")
dat4 <- anonymize_wave(df_wave4, wave = "wave4")
dat4_sample2 <- anonymize_wave(df_wave4_sample2, wave = "wave4",
                               sample = "sample2")
dat5 <- anonymize_wave(df_wave5, wave = "wave5")
dat5_sample2 <- anonymize_wave(df_wave5_sample2, wave = "wave5",
                               sample = "sample2")
dat6 <- anonymize_wave(df_wave6, wave = "wave6")
dat6_sample2 <- anonymize_wave(df_wave6_sample2, wave = "wave6",
                               sample = "sample2")

# Stack sample 2 below sample 1 for waves 4 to 6
dat4 <- rbind(dat4, dat4_sample2)
dat5 <- rbind(dat5, dat5_sample2)
dat6 <- rbind(dat6, dat6_sample2)

# Write one anonymized file per wave
invisible(Map(
  save_dataframe,
  list(dat1, dat2, dat3, dat4, dat5, dat6),
  paste0("wave", 1:6)
))
|
|
|