#' ---
#' title: "HMC anonymization"
#' author: "Nora Wickelmaier"
#' date: 2025-10-14
#' ---

# clear workspace
# NOTE(review): wiping the global environment is generally discouraged in
# scripts (run in a fresh R session instead); kept so that the script's
# behavior is unchanged.
rm(list = ls())

#------------------------------------------------------------------------------
# Helper functions

# Build anonymous ID labels ("subj0001", "subj0002", ...) from whole numbers.
make_subj_labels <- function(index) {
  paste0("subj", sprintf("%04d", index))
}

# Filter, check, and anonymize the raw survey export of one wave.
#
# Arguments:
#   data    Data frame with the raw survey data of one wave (one sample).
#   wave    Wave label of the form "wave<N>", e.g. "wave1".
#   sample  Either "sample1" (default) or "sample2".
#
# Returns the filtered data frame with the original ID columns and a fixed
# set of metadata columns removed, column names cleaned, and the columns
# `subj_id`, `sample`, and `wave` added.
#
# Side effects: prints the number of remaining rows and writes the ID mapping
# "id_mapping_key_<wave>.csv" to the working directory. For every wave other
# than "wave1" an existing mapping file is read and extended.
anonymize_wave <- function(data, wave, sample = "sample1") {

  # Fail early, before any key file is written
  if (!sample %in% c("sample1", "sample2")) {
    stop("Cannot understand what you put as sample.")
  }

  df <- data |>
    dplyr::filter(Status != 1) |>               # drop "Survey Preview" rows
    dplyr::filter(Finished != 0) |>             # drop unfinished surveys
    dplyr::filter(informed_consent == 1) |>     # drop: no informed consent
    dplyr::filter(truth_answer_check == 1) |>   # drop: no honest answers
    dplyr::filter(consent_use_of_data == 1)     # drop: no consent to data use

  cat("\nThe data for ", wave, " have ", nrow(df), " rows.\n\n")

  if (!all(df$participant_ID == df$PROLIFIC_PID, na.rm = TRUE)) {
    stop("Something is wrong with the IDs. Please check and rerun the script.")
  }

  if (anyDuplicated(df$participant_ID) > 0) {
    warning("In ", wave, " there are observations with identical participant IDs. Make sure to remove them!")
  }

  # Rows whose ID is the placeholder "-99" cannot be matched across waves
  is_placeholder <- df$participant_ID %in% "-99"
  if (any(is_placeholder)) {
    warning("In ", wave, " there is at least one participant without participant ID. Since these cannot be integrated with the other waves, they were removed from the data set.")
    df <- df[!is_placeholder, ]
  }

  # The same holds for rows with a missing ID
  if (any(is.na(df$participant_ID))) {
    warning("In ", wave, " there is at least one participant without participant ID. \nSince these cannot be integrated with the other waves, they were removed from the data set.")
    df <- df[!is.na(df$participant_ID), ]
  }

  current_key_file <- paste0("id_mapping_key_", wave, ".csv")

  if (wave == "wave1") {

    # One anonymous label per distinct participant, in order of appearance
    unique_ids <- unique(df$participant_ID)
    levels_id  <- make_subj_labels(seq_along(unique_ids))

    # Creating new anonymous ID variable
    df$subj_id <- factor(df$participant_ID,
                         levels = unique_ids,
                         labels = levels_id)

    # Create mapping file for the other waves. It holds one row per distinct
    # ID: duplicated rows would become duplicated factor levels (an error)
    # when the key is used for the later waves.
    id_mapping <- data.frame(
      original_id = unique_ids,
      subj_id = levels_id,
      stringsAsFactors = FALSE
    )
    write.csv(id_mapping, current_key_file, row.names = FALSE)

  } else {

    wave_number <- as.numeric(sub("^wave", "", wave))
    previous_key_file <- paste0("id_mapping_key_wave", wave_number - 1, ".csv")

    # A further sample of the same wave has to extend the key that the first
    # sample of this wave has written. Starting from the previous wave's key
    # again would overwrite that file and hand out the same new labels twice.
    if (sample != "sample1" && file.exists(current_key_file)) {
      key_source <- current_key_file
    } else {
      key_source <- previous_key_file
    }
    keys <- read.csv(key_source, colClasses = "character")

    # Check if ID is already present in keys
    missing_IDs <- unique(df$participant_ID[!df$participant_ID %in% keys$original_id])

    if (length(missing_IDs) > 0) {
      # Continue numbering after the highest label handed out so far
      start   <- max(as.numeric(sub("^subj", "", keys$subj_id)))
      new_IDs <- make_subj_labels(seq(start + 1, start + length(missing_IDs)))
      # Add missing ID to keys
      keys_new <- rbind(keys,
                        data.frame(original_id = missing_IDs,
                                   subj_id = new_IDs,
                                   stringsAsFactors = FALSE))
    } else {
      keys_new <- keys
    }
    write.csv(keys_new, current_key_file, row.names = FALSE)

    # Creating new anonymous ID variable
    df$subj_id <- factor(df$participant_ID,
                         levels = keys_new$original_id,
                         labels = keys_new$subj_id)
  }

  # Columns to drop: old ID variables and irrelevant Qualtrics variables
  # (either empty or uninformative)
  drop_cols <- c("participant_ID", "PROLIFIC_PID",
                 "IPAddress", "ResponseId", "RecipientLastName",
                 "RecipientFirstName", "RecipientEmail", "ExternalReference",
                 "LocationLatitude", "LocationLongitude",
                 "DistributionChannel", "UserLanguage", "RecordedDate",
                 "Progress")

  # Timing variables that are removed for specific waves only
  timing_suffixes <- c("First Click", "Last Click", "Page Submit", "Click Count")
  if (wave %in% paste0("wave", 1:4)) {
    drop_cols <- c(drop_cols, paste0("time_recording1_", timing_suffixes))
  }
  if (wave %in% paste0("wave", 3:6)) {
    drop_cols <- c(drop_cols, paste0("Q166_", timing_suffixes))
  }

  # all_of() is strict: a column that is expected but absent is an error
  df_anonym <- df |>
    dplyr::select(!dplyr::all_of(drop_cols))

  # Rename variables that might cause trouble: drop parentheses, then replace
  # blanks, hyphens, and "#" by underscores
  clean_names <- gsub("[()]", "", names(df_anonym))
  names(df_anonym) <- gsub("[ #-]", "_", clean_names)

  # Add which sample it is to data frame (validated at the top)
  df_anonym$sample <- sample

  # Add which wave it is to data frame
  df_anonym$wave <- wave

  # Return anonymized data frame
  #as.data.frame(df_anonym)
  df_anonym
}

# Write the anonymized data of one wave to "<path>/HMC_<wave>_anonymized.csv".
#
# Arguments:
#   data  Data frame to be written.
#   wave  Wave label used in the file name.
#   path  Output folder (with or without trailing slash).
#
# Returns the path of the written file, invisibly.
save_dataframe <- function(data, wave, path = "../02_anonymized_data/") {
  # Strip trailing slashes so that the file path does not contain "//"
  out_dir  <- sub("/+$", "", path)
  out_file <- paste0(out_dir, "/HMC_", wave, "_anonymized.csv")

  # as CSV
  write.csv(data, out_file, row.names = FALSE)
  # # as XLSX
  # openxlsx::write.xlsx(data,
  #   paste0(path, "/HMC_", wave, "_anonymized.xlsx"),
  #   rowNames = FALSE)
  # # as SAV
  # haven::write_sav(data,
  #   paste0(path, "/HMC_", wave, "_anonymized.sav"))

  invisible(out_file)
}

#------------------------------------------------------------------------------
# Read data for all waves

df_wave1 <- qualtRics::read_survey("wave1/AI+Trends_Wave1_August+4,+2025_10.03.csv")
df_wave2 <- qualtRics::read_survey("wave2/AI+Trends_Wave2_August+4,+2025_10.07.csv")
df_wave3 <- qualtRics::read_survey("wave3/AI+Trends_Wave3_August+4,+2025_10.07.csv")
df_wave4 <- qualtRics::read_survey("wave4/AI+Trends_Wave4_August+4,+2025_10.08.csv")
df_wave4_sample2 <- qualtRics::read_survey("wave4/AI+Trends_Wave4_sample2_August+4,+2025_10.09.csv")
df_wave5 <- qualtRics::read_survey("wave5/AI+Trends_Wave5_August+4,+2025_10.10.csv")
df_wave5_sample2 <- qualtRics::read_survey("wave5/AI+Trends_Wave5_sample2_August+4,+2025_10.11.csv")
df_wave6 <- qualtRics::read_survey("wave6/AI+Trends_Wave6_August+4,+2025_10.11.csv")
df_wave6_sample2 <- qualtRics::read_survey("wave6/AI+Trends_Wave6_sample2_August+4,+2025_10.12.csv")

# Anonymize data for all waves
# The order matters: each wave extends the ID key written by the wave before
# it, and within a wave sample 2 extends the key written for sample 1.

dat1 <- anonymize_wave(df_wave1, "wave1")
dat2 <- anonymize_wave(df_wave2, "wave2")
dat3 <- anonymize_wave(df_wave3, "wave3")
dat4 <- anonymize_wave(df_wave4, "wave4")
dat4_sample2 <- anonymize_wave(df_wave4_sample2, "wave4", "sample2")
dat5 <- anonymize_wave(df_wave5, "wave5")
dat5_sample2 <- anonymize_wave(df_wave5_sample2, "wave5", "sample2")
dat6 <- anonymize_wave(df_wave6, "wave6")
dat6_sample2 <- anonymize_wave(df_wave6_sample2, "wave6", "sample2")

# Combine sample 1 and sample 2 for waves 4 to 6

dat4 <- rbind(dat4, dat4_sample2)
dat5 <- rbind(dat5, dat5_sample2)
dat6 <- rbind(dat6, dat6_sample2)

# Save anonymized data frames

save_dataframe(dat1, "wave1")
save_dataframe(dat2, "wave2")
save_dataframe(dat3, "wave3")
save_dataframe(dat4, "wave4")
save_dataframe(dat5, "wave5")
save_dataframe(dat6, "wave6")