data/03_data/01_raw_data/anonymization.R

184 lines
6.9 KiB
R

#' ---
#' title: "HMC anonymization"
#' author: "Nora Wickelmaier"
#' date: 2025-10-14
#' ---
# clear workspace
# NOTE(review): rm(list = ls()) removes all non-hidden objects from the
# environment the script is run in. It does not detach packages or reset
# options, so it does not give a clean session; running the script in a fresh
# R process is the more reliable way to get one.
rm(list = ls())
#------------------------------------------------------------------------------
# Helper functions
# Anonymize the raw export of one survey wave.
#
# Keeps only valid responses, replaces the participant IDs by running
# pseudonyms ("subj0001", "subj0002", ...), drops identifying / uninformative
# Qualtrics columns and cleans up the column names.
#
# Arguments:
#   data    Data frame with the raw responses of one wave. It must contain the
#           columns used below (Status, Finished, informed_consent,
#           truth_answer_check, consent_use_of_data, participant_ID,
#           PROLIFIC_PID, the Qualtrics meta columns and the wave-specific
#           timing columns).
#   wave    Character scalar naming the wave, e.g. "wave1". The digits after
#           the prefix are used to locate the key file of the previous wave.
#   sample  Either "sample1" (default) or "sample2"; stored in column `sample`.
#
# Returns:
#   The anonymized data frame with the additional columns `subj_id`, `sample`
#   and `wave`.
#
# Side effects:
#   Prints the number of valid rows and writes "id_mapping_key_<wave>.csv"
#   (original ID -> pseudonym) to the working directory. For every wave except
#   "wave1" the key file of the previous wave is read from the working
#   directory. For "sample2" the key file of the *same* wave is used as the
#   starting point if it exists, so that pseudonyms assigned to sample 1 in
#   that wave are neither lost nor handed out a second time. Sample 1 of a
#   wave therefore has to be processed before sample 2 of that wave.
#
# Errors:
#   Stops if `sample` is not one of the two known values or if participant_ID
#   and PROLIFIC_PID disagree.
anonymize_wave <- function(data, wave, sample = "sample1") {
  # Reject unknown samples before any key file is read or written
  if (!isTRUE(sample %in% c("sample1", "sample2"))) {
    stop("Cannot understand what you put as sample.")
  }

  # Keep valid responses only (all conditions have to hold)
  df <- data |>
    dplyr::filter(
      Status != 1,               # drop survey previews (Status 1)
      Finished != 0,             # drop unfinished surveys
      informed_consent == 1,     # keep participants who gave consent
      truth_answer_check == 1,   # keep participants who answered honestly
      consent_use_of_data == 1   # keep participants who allow data use
    )
  cat("\nThe data for ", wave, " have ", nrow(df), " rows.\n\n")

  if (!all(df$participant_ID == df$PROLIFIC_PID, na.rm = TRUE)) {
    stop("Something is wrong with the IDs. Please check and rerun the script.")
  }

  # Rows without a usable ID ("-99" or NA) cannot be linked across waves.
  # They are removed before the duplicate check so that several missing IDs
  # are not reported as duplicated participants.
  no_id <- is.na(df$participant_ID) | df$participant_ID %in% "-99"
  if (any(no_id)) {
    warning("In ", wave, " there is at least one participant without participant ID. Since these cannot be integrated with the other waves, they were removed from the data set.")
    df <- df[!no_id, ]
  }
  if (anyDuplicated(df$participant_ID) > 0) {
    warning("In ", wave, " there are observations with identical participant IDs. Make sure to remove them!")
  }

  key_file <- paste0("id_mapping_key_", wave, ".csv")
  if (wave == "wave1") {
    # One key row per participant, numbered in order of first appearance.
    # Duplicated rows in the key would make factor() fail in later waves.
    ids <- unique(df$participant_ID)
    keys_new <- data.frame(
      original_id = ids,
      subj_id = sprintf("subj%04d", seq_along(ids)),
      stringsAsFactors = FALSE
    )
  } else {
    wave_number <- as.numeric(sub("^[^0-9]*", "", wave))
    previous_key_file <- paste0("id_mapping_key_wave", wave_number - 1, ".csv")
    # Sample 2 continues the key that sample 1 wrote for the same wave
    if (sample == "sample2" && file.exists(key_file)) {
      keys <- read.csv(key_file)
    } else {
      keys <- read.csv(previous_key_file)
    }
    keys <- keys[!duplicated(keys$original_id), , drop = FALSE]

    # Participants that do not have a pseudonym yet get the next free numbers
    missing_IDs <- unique(
      df$participant_ID[!df$participant_ID %in% keys$original_id]
    )
    if (length(missing_IDs) > 0) {
      used <- as.numeric(sub("^subj", "", keys$subj_id))
      start <- if (length(used) > 0) max(used) else 0
      new_IDs <- sprintf("subj%04d", start + seq_along(missing_IDs))
      keys_new <- rbind(
        keys,
        data.frame(original_id = missing_IDs, subj_id = new_IDs,
                   stringsAsFactors = FALSE)
      )
    } else {
      keys_new <- keys
    }
  }
  write.csv(keys_new, key_file, row.names = FALSE)

  # Creating new anonymous ID variable
  df$subj_id <- factor(df$participant_ID,
                       levels = keys_new$original_id,
                       labels = keys_new$subj_id)

  # Remove old ID variables and irrelevant Qualtrics variables
  # (either empty or uninformative)
  df_anonym <- df |>
    dplyr::select(-participant_ID, -PROLIFIC_PID) |>
    dplyr::select(-IPAddress, -ResponseId, -RecipientLastName,
                  -RecipientFirstName, -RecipientEmail, -ExternalReference,
                  -LocationLatitude, -LocationLongitude, -DistributionChannel,
                  -UserLanguage, -RecordedDate, -Progress)

  # Timing columns that are only dropped for certain waves
  if (wave %in% paste0("wave", 1:4)) {
    df_anonym <- df_anonym |>
      dplyr::select(-`time_recording1_First Click`,
                    -`time_recording1_Last Click`,
                    -`time_recording1_Page Submit`,
                    -`time_recording1_Click Count`)
  }
  if (wave %in% paste0("wave", 3:6)) {
    df_anonym <- df_anonym |>
      dplyr::select(-`Q166_First Click`, -`Q166_Last Click`,
                    -`Q166_Page Submit`, -`Q166_Click Count`)
  }

  # Rename variables that might cause trouble: parentheses are removed,
  # blanks, hyphens and hash signs become underscores
  names(df_anonym) <- gsub("[()]", "", names(df_anonym))
  names(df_anonym) <- gsub("[ #-]", "_", names(df_anonym))

  # Add sample and wave to data frame
  df_anonym$sample <- sample
  df_anonym$wave <- wave

  # Return anonymized data frame
  df_anonym
}
# Store an anonymized data frame on disk.
#
# Arguments:
#   data  Data frame to be written.
#   wave  Label of the wave; becomes part of the file name.
#   path  Directory the file is written to.
#
# The file is written as "<path>/HMC_<wave>_anonymized.csv" without row names.
save_dataframe <- function(data, wave, path = "../02_anonymized_data/") {
  csv_file <- paste0(path, "/HMC_", wave, "_anonymized.csv")

  # Further export formats, currently switched off:
  # # as XLSX
  # openxlsx::write.xlsx(data,
  #                      paste0(path, "/HMC_", wave, "_anonymized.xlsx"),
  #                      rowNames = FALSE)
  # # as SAV
  # haven::write_sav(data,
  #                  paste0(path, "/HMC_", wave, "_anonymized.sav"))

  # as CSV
  write.csv(x = data, file = csv_file, row.names = FALSE)
}
#------------------------------------------------------------------------------
# Read the raw Qualtrics exports ----------------------------------------------
# One file per wave; from wave 4 on there is an additional file for sample 2.
df_wave1 <- qualtRics::read_survey(
  "wave1/AI+Trends_Wave1_August+4,+2025_10.03.csv"
)
df_wave2 <- qualtRics::read_survey(
  "wave2/AI+Trends_Wave2_August+4,+2025_10.07.csv"
)
df_wave3 <- qualtRics::read_survey(
  "wave3/AI+Trends_Wave3_August+4,+2025_10.07.csv"
)
df_wave4 <- qualtRics::read_survey(
  "wave4/AI+Trends_Wave4_August+4,+2025_10.08.csv"
)
df_wave4_sample2 <- qualtRics::read_survey(
  "wave4/AI+Trends_Wave4_sample2_August+4,+2025_10.09.csv"
)
df_wave5 <- qualtRics::read_survey(
  "wave5/AI+Trends_Wave5_August+4,+2025_10.10.csv"
)
df_wave5_sample2 <- qualtRics::read_survey(
  "wave5/AI+Trends_Wave5_sample2_August+4,+2025_10.11.csv"
)
df_wave6 <- qualtRics::read_survey(
  "wave6/AI+Trends_Wave6_August+4,+2025_10.11.csv"
)
df_wave6_sample2 <- qualtRics::read_survey(
  "wave6/AI+Trends_Wave6_sample2_August+4,+2025_10.12.csv"
)

# Anonymize every wave ---------------------------------------------------------
# The calls rely on the ID mapping key files written by earlier calls, so the
# order below (wave by wave, sample 1 before sample 2) must be kept.
dat1 <- df_wave1 |> anonymize_wave(wave = "wave1")
dat2 <- df_wave2 |> anonymize_wave(wave = "wave2")
dat3 <- df_wave3 |> anonymize_wave(wave = "wave3")
dat4 <- df_wave4 |> anonymize_wave(wave = "wave4")
dat4_sample2 <- df_wave4_sample2 |>
  anonymize_wave(wave = "wave4", sample = "sample2")
dat5 <- df_wave5 |> anonymize_wave(wave = "wave5")
dat5_sample2 <- df_wave5_sample2 |>
  anonymize_wave(wave = "wave5", sample = "sample2")
dat6 <- df_wave6 |> anonymize_wave(wave = "wave6")
dat6_sample2 <- df_wave6_sample2 |>
  anonymize_wave(wave = "wave6", sample = "sample2")

# Stack both samples for the waves that have two (waves 4 to 6) ----------------
dat4 <- rbind(dat4, dat4_sample2)
dat5 <- rbind(dat5, dat5_sample2)
dat6 <- rbind(dat6, dat6_sample2)

# Write one anonymized file per wave -------------------------------------------
invisible(Map(
  save_dataframe,
  list(dat1, dat2, dat3, dat4, dat5, dat6),
  c("wave1", "wave2", "wave3", "wave4", "wave5", "wave6")
))