data_management/03_data_organisation/example/code/01_preprocessing.R

79 lines
3.0 KiB
R
Raw Normal View History

2024-06-07 13:47:03 +02:00
# 01_preprocessing.R
#
# Cleaning up data for toy data set Methods Seminar SS2024
#
# Input:  data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv
# Output: data/results/data_rdm-ms-ss2024_cleaned.csv
#         data/results/data_rdm-ms-ss2024_cleaned.RData
#
# created: 2024-06-03
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/teaching/iwm/data_management/03_data_organisation/example/")
# Read the raw survey export
#
# The first three lines of the file are skipped when reading the data rows;
# the variable names are taken separately from the first line of the same
# file (see below). Empty fields are read as NA, and character columns are
# converted to factors.
raw_file <- "data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv"

dat <- read.table(raw_file, sep = ",", skip = 3, stringsAsFactors = TRUE,
                  na.strings = "")

# Variable names: first line of the raw file, split at the commas
names(dat) <-
  readLines(raw_file, n = 1) |>
  strsplit(split = ",") |>
  unlist()
# Clean up variables
#
# Categorical variables are turned into factors with an explicit level order;
# any value that is not listed in `levels` becomes NA.
dat$ResponseId <- factor(dat$ResponseId)

sex_levels <- c("m", "f", "d", "not indicated")
dat$sex <- factor(dat$sex, levels = sex_levels)

sharing_levels <- c("No", "Yes")
dat$data_sharing_1 <- factor(dat$data_sharing_1, levels = sharing_levels)

career_levels <- c("Student", "PhD student", "Postdoc", "Senior researcher",
                   "Professor", "Other")
dat$career_level_1 <- factor(dat$career_level_1, levels = career_levels)

# The five statement items all use the same agreement scale
agreement_levels <- c("Strongly disagree", "Disagree",
                      "Neither agree nor disagree", "Agree",
                      "Strongly agree")
stmnt_vars <- paste0("rdm_stmnt_", 1:5)
dat[stmnt_vars] <- lapply(dat[stmnt_vars], factor, levels = agreement_levels)
## Fix data_sharing_2
#
# The column contains the free-text entry "1 out of 4" and was read with
# stringsAsFactors = TRUE, so it is a factor at this point. Calling
# as.numeric() directly on a factor returns the internal level codes, not the
# numbers that were typed in. Therefore: convert to character first, recode
# the text entry, and only then convert to numeric.
data_sharing_2_chr <- as.character(dat$data_sharing_2)
# %in% never yields NA, so missing values are left untouched
data_sharing_2_chr[data_sharing_2_chr %in% "1 out of 4"] <- "1"
# Any other non-numeric entry becomes NA (with a warning)
dat$data_sharing_2 <- as.numeric(data_sharing_2_chr)
# Create numeric statement variables
#
# The statement items are factors whose levels run from "Strongly disagree"
# to "Strongly agree" (set in the clean-up section above), so as.numeric()
# yields the position on that scale: 1 = "Strongly disagree", ...,
# 5 = "Strongly agree".
#
# Each item is converted from its own column. (Before, rdm_stmnt_4 was
# assigned as.numeric(dat$rdm_stmnt_5), which overwrote item 4 with item 5.)
stmnt_items <- paste0("rdm_stmnt_", 1:5)
dat[stmnt_items] <- lapply(dat[stmnt_items], as.numeric)
# Save cleaned data set
#
# Two copies are written: a semicolon-separated text file (no row names,
# no quoting) and an .RData file containing the object `dat`.
csv_out <- "data/results/data_rdm-ms-ss2024_cleaned.csv"
rdata_out <- "data/results/data_rdm-ms-ss2024_cleaned.RData"

write.table(dat, file = csv_out, sep = ";", quote = FALSE, row.names = FALSE)
save(dat, file = rdata_out)