data/03_data/01_raw_data/anonymization.R

#' ---
#' title: "HMC anonymization"
#' author: "Nora Wickelmaier"
#' date: 2025-10-14
#' ---

# Clear workspace: remove every object that ls() lists in the current
# environment, presumably so the script does not pick up objects left over
# from earlier work in the session.
# NOTE(review): when this file is source()d into a running session this also
# wipes the caller's objects; attached packages and options are not reset.
rm(list = ls())

#------------------------------------------------------------------------------
# Helper functions
#' Filter, check and anonymize the raw export of one survey wave
#'
#' Keeps only valid responses, replaces the participant IDs by anonymous
#' `subjNNNN` IDs that stay stable across waves, drops identifying or
#' uninformative Qualtrics columns and cleans the column names.
#'
#' Side effects: prints the number of retained rows and writes the ID mapping
#' key `id_mapping_key_<wave>.csv` to the working directory. For every wave
#' other than "wave1" the key of the previous wave is read from the working
#' directory, so waves have to be processed in ascending order. A call with
#' `sample = "sample2"` continues from the key of the *same* wave if that file
#' exists (it is written by the "sample1" call), so that IDs handed out for
#' sample 1 are neither lost nor reused. Run the "sample1" call of a wave
#' before its "sample2" call; a stale key file of the same wave from an
#' earlier run would otherwise be picked up.
#'
#' @param data Data frame with the raw survey export. Must contain the
#'   columns referenced below (`Status`, `Finished`, the consent items,
#'   `participant_ID`, `PROLIFIC_PID`, the Qualtrics meta columns and the
#'   wave-specific timing columns).
#' @param wave Wave label of the form "wave<number>", e.g. "wave1".
#' @param sample Either "sample1" (default) or "sample2".
#' @return The filtered data with the factor `subj_id` and the columns
#'   `sample` and `wave` added, and the ID / meta columns removed.
anonymize_wave <- function(data, wave, sample = "sample1") {

  # Validate `sample` up front so that a bad call does not write a key file
  if (length(sample) != 1 || !sample %in% c("sample1", "sample2")) {
    stop("Cannot understand what you put as sample.")
  }

  df <- data |>
    dplyr::filter(Status != 1) |>             # drop Status 1 ("Survey Preview")
    dplyr::filter(Finished != 0) |>           # drop unfinished surveys
    dplyr::filter(informed_consent == 1) |>   # drop if no informed consent
    dplyr::filter(truth_answer_check == 1) |> # drop if answers not honest
    dplyr::filter(consent_use_of_data == 1)   # drop if no consent to data use

  cat("\nThe data for ", wave, " have ", nrow(df), " rows.\n\n")

  # Both ID columns have to agree wherever both are available
  if (!all(df$participant_ID == df$PROLIFIC_PID, na.rm = TRUE)) {
    stop("Something is wrong with the IDs. Please check and rerun the script.")
  }

  if (length(df$participant_ID) > length(unique(df$participant_ID))) {
    warning("In ", wave, " there are observations with identical ",
            "participant IDs. Make sure to remove them!")
  }

  # Rows without a usable ID ("-99" or NA) cannot be linked across waves
  if (any(df$participant_ID %in% "-99")) {
    warning("In ", wave, " there is at least one participant without ",
            "participant ID. Since these cannot be integrated with the ",
            "other waves, they were removed from the data set.")
    df <- df[!(df$participant_ID %in% "-99"), ]
  }

  if (any(is.na(df$participant_ID))) {
    warning("In ", wave, " there is at least one participant without ",
            "participant ID. Since these cannot be integrated with the ",
            "other waves, they were removed from the data set.")
    df <- df[!is.na(df$participant_ID), ]
  }

  key_path <- function(w) paste0("id_mapping_key_", w, ".csv")

  if (wave == "wave1") {
    # One key row per participant (not per response): duplicated original
    # IDs in the key would make factor() fail in all later waves
    ids <- unique(df$participant_ID)
    keys_new <- data.frame(
      original_id = ids,
      subj_id = sprintf("subj%04d", seq_along(ids)),
      stringsAsFactors = FALSE
    )
  } else {
    wave_no <- as.numeric(sub("^wave", "", wave))
    key_in <- key_path(paste0("wave", wave_no - 1))

    # A second sample of the same wave has to extend the key that the first
    # sample just wrote; starting again from the previous wave would reuse
    # the numbers of, and drop, the IDs added for sample 1
    if (sample != "sample1" && file.exists(key_path(wave))) {
      key_in <- key_path(wave)
    }

    # Read IDs as text so that read.csv() does not guess another type
    keys <- read.csv(key_in, colClasses = "character")

    # Participants that have no anonymous ID yet
    missing_IDs <- unique(
      df$participant_ID[!df$participant_ID %in% keys$original_id]
    )

    if (length(missing_IDs) > 0) {
      # Continue after the highest number in use, independent of row order
      used <- as.numeric(sub("^subj", "", keys$subj_id))
      start <- if (length(used) > 0) max(used) else 0
      keys_new <- rbind(
        keys,
        data.frame(
          original_id = missing_IDs,
          subj_id = sprintf("subj%04d", start + seq_along(missing_IDs)),
          stringsAsFactors = FALSE
        )
      )
    } else {
      keys_new <- keys
    }
  }

  # Store the mapping for the following waves
  write.csv(keys_new, key_path(wave), row.names = FALSE)

  # Create the anonymous ID variable
  df$subj_id <- factor(df$participant_ID,
                       levels = keys_new$original_id,
                       labels = keys_new$subj_id)

  # Remove old ID variables
  df_anonym <- df |>
    dplyr::select(-participant_ID, -PROLIFIC_PID)

  # Remove irrelevant Qualtrics variables (either empty or uninformative)
  df_anonym <- df_anonym |>
    dplyr::select(-IPAddress, -ResponseId, -RecipientLastName,
                  -RecipientFirstName, -RecipientEmail, -ExternalReference,
                  -LocationLatitude, -LocationLongitude, -DistributionChannel,
                  -UserLanguage, -RecordedDate, -Progress)

  # Timing columns that are only removed for waves 1 to 4
  if (wave %in% paste0("wave", 1:4)) {
    df_anonym <- df_anonym |>
      dplyr::select(-`time_recording1_First Click`,
                    -`time_recording1_Last Click`,
                    -`time_recording1_Page Submit`,
                    -`time_recording1_Click Count`)
  }

  # Timing columns that are only removed for waves 3 to 6
  if (wave %in% paste0("wave", 3:6)) {
    df_anonym <- df_anonym |>
      dplyr::select(-`Q166_First Click`, -`Q166_Last Click`,
                    -`Q166_Page Submit`, -`Q166_Click Count`)
  }

  # Rename variables that might cause trouble: parentheses are dropped;
  # blanks, hyphens and "#" become underscores
  names(df_anonym) <- gsub("[()]", "", names(df_anonym))
  names(df_anonym) <- gsub("[ #-]", "_", names(df_anonym))

  # Add which sample and which wave it is to the data frame
  df_anonym$sample <- sample
  df_anonym$wave <- wave

  # Return anonymized data frame
  df_anonym
}

#' Write an anonymized wave to disk
#'
#' Stores `data` as `HMC_<wave>_anonymized.csv` inside `path`, without row
#' names.
#'
#' @param data Data frame to be written.
#' @param wave Wave label that becomes part of the file name, e.g. "wave1".
#' @param path Target directory; it is joined to the file name with "/".
#' @return The value of `write.csv()`; the function is called for its side
#'   effect.
save_dataframe <- function(data, wave, path = "../02_anonymized_data/") {
  # Export as XLSX (openxlsx::write.xlsx) or SAV (haven::write_sav) is
  # currently switched off; only the CSV file is written.
  csv_file <- paste0(path, "/HMC_", wave, "_anonymized.csv")
  write.csv(data, file = csv_file, row.names = FALSE)
}
#------------------------------------------------------------------------------

# Import the raw survey exports (one file per wave and sample)
df_wave1 <- qualtRics::read_survey(
  "wave1/AI+Trends_Wave1_August+4,+2025_10.03.csv"
)
df_wave2 <- qualtRics::read_survey(
  "wave2/AI+Trends_Wave2_August+4,+2025_10.07.csv"
)
df_wave3 <- qualtRics::read_survey(
  "wave3/AI+Trends_Wave3_August+4,+2025_10.07.csv"
)
df_wave4 <- qualtRics::read_survey(
  "wave4/AI+Trends_Wave4_August+4,+2025_10.08.csv"
)
df_wave4_sample2 <- qualtRics::read_survey(
  "wave4/AI+Trends_Wave4_sample2_August+4,+2025_10.09.csv"
)
df_wave5 <- qualtRics::read_survey(
  "wave5/AI+Trends_Wave5_August+4,+2025_10.10.csv"
)
df_wave5_sample2 <- qualtRics::read_survey(
  "wave5/AI+Trends_Wave5_sample2_August+4,+2025_10.11.csv"
)
df_wave6 <- qualtRics::read_survey(
  "wave6/AI+Trends_Wave6_August+4,+2025_10.11.csv"
)
df_wave6_sample2 <- qualtRics::read_survey(
  "wave6/AI+Trends_Wave6_sample2_August+4,+2025_10.12.csv"
)

# Anonymize wave by wave. Keep this order: every wave after the first reads
# the ID key file that an earlier call has written.
dat1 <- anonymize_wave(data = df_wave1, wave = "wave1")
dat2 <- anonymize_wave(data = df_wave2, wave = "wave2")
dat3 <- anonymize_wave(data = df_wave3, wave = "wave3")
dat4 <- anonymize_wave(data = df_wave4, wave = "wave4")
dat4_sample2 <- anonymize_wave(data = df_wave4_sample2, wave = "wave4",
                               sample = "sample2")
dat5 <- anonymize_wave(data = df_wave5, wave = "wave5")
dat5_sample2 <- anonymize_wave(data = df_wave5_sample2, wave = "wave5",
                               sample = "sample2")
dat6 <- anonymize_wave(data = df_wave6, wave = "wave6")
dat6_sample2 <- anonymize_wave(data = df_wave6_sample2, wave = "wave6",
                               sample = "sample2")

# Waves 4 to 6 consist of two samples each: stack them row-wise
dat4 <- rbind(dat4, dat4_sample2)
dat5 <- rbind(dat5, dat5_sample2)
dat6 <- rbind(dat6, dat6_sample2)

# Write one anonymized file per wave
save_dataframe(data = dat1, wave = "wave1")
save_dataframe(data = dat2, wave = "wave2")
save_dataframe(data = dat3, wave = "wave3")
save_dataframe(data = dat4, wave = "wave4")
save_dataframe(data = dat5, wave = "wave5")
save_dataframe(data = dat6, wave = "wave6")