diff --git a/03_data_organisation/03_data_organisation.tex b/03_data_organisation/03_data_organisation.tex new file mode 100644 index 0000000..55b231e --- /dev/null +++ b/03_data_organisation/03_data_organisation.tex @@ -0,0 +1,844 @@ +\documentclass[aspectratio=169]{beamer} + +\usepackage{listings} +\usepackage[utf8,latin1]{inputenc} +\usepackage[style = apa, backend = biber, natbib = true]{biblatex} +\addbibresource{../literature/lit.bib} + +\usepackage{fancyvrb} +\usepackage{fontawesome5} % get icons +\usepackage{multirow} +\usepackage{color, colortbl} + +\usepackage{tikz} +\usetikzlibrary{fit} +\usepackage[edges]{forest} + +\lstset{language=R,% + backgroundcolor=\color{iwmgray!15!white}, + basicstyle=\ttfamily\color{iwmgray}, + frame=none, + commentstyle=\slshape\color{iwmgreen}, + keywordstyle=\bfseries\color{iwmgray}, + identifierstyle=\color{iwmpurple}, + stringstyle=\color{iwmblue}, + numbers=none,%left,numberstyle=\tiny, + basewidth={.5em, .4em}, + showstringspaces=false, + emphstyle=\color{red!50!white}} + +\makeatletter \def\newblock{\beamer@newblock} \makeatother + +\beamertemplatenavigationsymbolsempty +\setbeamertemplate{itemize items}[circle] +\setbeamertemplate{section in toc}[circle] +\mode{\setbeamercolor{math text displayed}{fg=iwmgray}} +\setbeamercolor{block body}{bg=iwmorange!50!white} +\setbeamercolor{block title}{fg=white, bg=iwmorange} +% Definitions for biblatex +\setbeamercolor{bibliography entry note}{fg=iwmgray} +\setbeamercolor{bibliography entry author}{fg=iwmgray} +\setbeamertemplate{bibliography item}{} + +\definecolor{iwmorange}{RGB}{255,105,0} +\definecolor{iwmgray}{RGB}{67,79,79} +\definecolor{iwmblue}{RGB}{60,180,220} +\definecolor{iwmgreen}{RGB}{145,200,110} +\definecolor{iwmpurple}{RGB}{120,0,75} + +\setbeamercolor{title}{fg=iwmorange} +\setbeamercolor{frametitle}{fg=iwmorange} +\setbeamercolor{structure}{fg=iwmorange} +\setbeamercolor{normal text}{fg=iwmgray} +\setbeamercolor{author}{fg=iwmgray} 
+\setbeamercolor{date}{fg=iwmgray} + +\newcommand{\vect}[1]{\mathbf{#1}} +\newcommand{\mat}[1]{\mathbf{#1}} +\newcommand{\gvect}[1]{\boldsymbol{#1}} +\newcommand{\gmat}[1]{\boldsymbol{#1}} + +\AtBeginSection[]{ + \frame{ + \tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}} + +\setbeamertemplate{headline}{ + \begin{beamercolorbox}{section in head} + \vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt + \end{beamercolorbox} +} + +\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt} + +\title{Data organisation for effective research data management} +\author{Nora Wickelmaier} +\date{June 10, 2024} + +\begin{document} + +\begin{frame}{} +\thispagestyle{empty} +\titlepage +\end{frame} + +\begin{frame}{Data request} + \begin{center} + \includegraphics[scale = .55]{../figures/email_data_request_2024_01} + \end{center} +\end{frame} + +\begin{frame}{Data folder for the data requested} + \begin{center} + \includegraphics[scale = .6]{../figures/email_data_request_2024_03} + \end{center} +\end{frame} + +\begin{frame}{What is bad about this data organisation?} + % slido + \centering + \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 3} + + \url{https://app.sli.do/event/3S1Bn3Tjknuk5J5WiqAYzG} +\end{frame} + +\begin{frame}[<+->]{Bad things about this data organisation} + \begin{itemize} + \item Raw and processed data are in the same folder + \item File naming does not sort in a sensible way: Best order would be first + by subject, then by session + \item Data and data scripts are in the same folder + \item Data scripts are not numbered, unclear in what order they need to be + executed + \item There are plot files (PDFs) between the data and code files + \item It is unclear which are the final and processed data files + \item The final data files are not stored in an interoperable format: There + is only an \texttt{.RData} file that (probably) contains the final data + which 
was used for further analyses + \item There is no documentation whatsoever + \item \dots + \end{itemize} +\end{frame} + +\begin{frame}{Topics for this semester} +\centering +\begin{tabular}{ll} +\hline +Date & Topic \\ +\hline +2024-05-13 & Introduction to data management \\ +2024-05-27 & Workflow \\ +\only<1>{2024-06-10}\only<2>{\bf 2024-06-10} & \only<1>{Data organisation}\only<2>{\bf Data organisation}\\ +2024-06-24 & Data sharing \\ +2024-07-08 & Clean coding \\ +2024-07-22 & Version control \\ +\hline +\end{tabular} +\end{frame} + +% * different data sources +% * content README file +% * best arrangement of data +% * redundancy +% * anonymizing/pseudonymizing data + +\section{Folder organisation} + +\begin{frame}[<+->]{Some general rules} + \begin{itemize} + \item One project, one folder + \item Add README file at top level + \item Raw data are in a separate folder (and stay separate!) + \item Have a code folder + \item It is often a good idea to separate your data analysis from papers, + talks, etc. (especially if you want to publish your data) + \item Have designated folders where stuff is written to (e.\,g., + \texttt{results}, \texttt{figures}, \texttt{processed}, etc.) 
+ \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Folder organisation} + {One possible example!} + \begin{tikzpicture}[ + every node/.style = {text width = 4cm, align = left}, + every path/.style = {thick, draw} + ] + \node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+}; + % first level + \node (n1) at (4, 0) {\faIcon{folder} \verb+admin+}; + \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+}; + \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+}; + \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+}; + \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (top.east) -- (n1.west); + \path (top.east) -- (n2.west); + \path (top.east) -- (n3.west); \path (top.east) -- (n4.west); + \path (top.east) -- (file.west); + \end{tikzpicture} + \vfill +\end{frame} + +\begin{frame}[fragile]{Folder organisation} + {Analysis folder} + \begin{tikzpicture}[ + every node/.style = {text width = 4cm, align = left}, + every path/.style = {thick, draw} + ] + \node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+}; + % first level + \node (n1) at (4, 0) {\faIcon{folder} \verb+admin+}; + \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+}; + \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+}; + \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+}; + \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (top.east) -- (n1.west); + \path (top.east) -- (n2.west); + \path (top.east) -- (n3.west); \path (top.east) -- (n4.west); + \path (top.east) -- (file.west); + % second level + \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+}; + \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+}; + \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+}; + \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+}; + \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (n2.east) -- (o1.west); + \path (n2.east) -- (o2.west); + \path (n2.east) -- (o3.west); + \path 
(n2.east) -- (o4.west); + \path (n2.east) -- (o5.west); + \end{tikzpicture} + \vfill +\end{frame} + +\begin{frame}[fragile]{Folder organisation} + {Analysis folder} + \begin{tikzpicture}[ + every node/.style = {text width = 4cm, align = left}, + every path/.style = {thick, draw} + ] + \node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+}; + % first level + \node (n1) at (4, 0) {\faIcon{folder} \verb+admin+}; + \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+}; + \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+}; + \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+}; + \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (top.east) -- (n1.west); + \path (top.east) -- (n2.west); + \path (top.east) -- (n3.west); \path (top.east) -- (n4.west); + \path (top.east) -- (file.west); + % second level + \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+}; + \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+}; + \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+}; + \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+}; + \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (n2.east) -- (o1.west); + \path (n2.east) -- (o2.west); + \path (n2.east) -- (o3.west); + \path (n2.east) -- (o4.west); + \path (n2.east) -- (o5.west); + % third level + \node[text width = 5cm] (p1) at (12, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+}; + \node[text width = 5cm] (p2) at (12, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+}; + \node[text width = 5cm] (p3) at (12, -1.4) {\faIcon[regular]{file} \verb+03_modeling.R+}; + \node[text width = 5cm] (p4) at (12, -2.1) {\faIcon[regular]{file} \verb+04_plots.R+}; + \path (o1.center) -- (p1.west); + \path (o1.center) -- (p2.west); + \path (o1.center) -- (p3.west); + \path (o1.center) -- (p4.west); + \end{tikzpicture} + \vfill +\end{frame} + +\begin{frame}[fragile]{Folder organisation} + {Analysis folder} + \begin{tikzpicture}[ + every 
node/.style = {text width = 4cm, align = left}, + every path/.style = {thick, draw} + ] + \node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+}; + % first level + \node (n1) at (4, 0) {\faIcon{folder} \verb+admin+}; + \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+}; + \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+}; + \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+}; + \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (top.east) -- (n1.west); + \path (top.east) -- (n2.west); + \path (top.east) -- (n3.west); \path (top.east) -- (n4.west); + \path (top.east) -- (file.west); + % second level + \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+}; + \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+}; + \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+}; + \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+}; + \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (n2.east) -- (o1.west); + \path (n2.east) -- (o2.west); + \path (n2.east) -- (o3.west); + \path (n2.east) -- (o4.west); + \path (n2.east) -- (o5.west); + % third level + \node[text width = 5cm] (p1) at (12, 0) {\faIcon[regular]{file} \verb+subj1_ses01.txt+}; + \node[text width = 5cm] (p2) at (12, -0.7) {\faIcon[regular]{file} \verb+subj1_ses02.txt+}; + \node[text width = 5cm] (p3) at (12, -1.4) {\faIcon[regular]{file} \verb+subj2_ses01.txt+}; + \node[text width = 5cm] (p4) at (12, -2.1) {\faIcon[regular]{file} \verb+subj2_ses02.txt+}; + \node[text width = 5cm] (p5) at (12, -2.8) {\faIcon[regular]{file} \dots}; + \path (o2.center) -- (p1.west); + \path (o2.center) -- (p2.west); + \path (o2.center) -- (p3.west); + \path (o2.center) -- (p4.west); + \path (o2.center) -- (p5.west); + \end{tikzpicture} + \vfill +\end{frame} + +\begin{frame}[fragile]{Folder organisation} + {Analysis folder} + \begin{tikzpicture}[ + every node/.style = {text width = 4cm, align = left}, + every path/.style = {thick, draw} + ] 
+ \node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+}; + % first level + \node (n1) at (4, 0) {\faIcon{folder} \verb+admin+}; + \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+}; + \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+}; + \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+}; + \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (top.east) -- (n1.west); + \path (top.east) -- (n2.west); + \path (top.east) -- (n3.west); \path (top.east) -- (n4.west); + \path (top.east) -- (file.west); + % second level + \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+}; + \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+}; + \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+}; + \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+}; + \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (n2.east) -- (o1.west); + \path (n2.east) -- (o2.west); + \path (n2.east) -- (o3.west); + \path (n2.east) -- (o4.west); + \path (n2.east) -- (o5.west); + % third level + \node[text width = 5cm] (p1) at (12, -0.7) {\faIcon[regular]{file} + \verb+data_all-subj.csv+}; + \node[text width = 5cm] (p2) at (12, -1.4) {\faIcon[regular]{file} + \verb+data_all-subj.RData+}; + \node[text width = 5cm] (p3) at (12, -2.1) {\faIcon[regular]{file} + \verb+eval_model1.csv+}; + \node[text width = 5cm] (p4) at (12, -2.8) {\faIcon[regular]{file} + \verb+eval_model2.csv+}; + \path (o4.center) -- (p1.west); + \path (o4.center) -- (p2.west); + \path (o4.center) -- (p3.west); + \path (o4.center) -- (p4.west); + \end{tikzpicture} + \vfill + \pause + The analysis folder you might want to share on OSF, GitHub, etc. 
+\end{frame} + +\begin{frame}[fragile]{Folder organisation} + {Dissemination folder} + \begin{tikzpicture}[ + every node/.style = {text width = 4cm, align = left}, + every path/.style = {thick, draw} + ] + \node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+}; + % first level + \node (n1) at (4, 0) {\faIcon{folder} \verb+admin+}; + \node (n2) at (4, -0.7) {\faIcon{folder} \verb+analysis+}; + \node[text width = 3.2cm] (n3) at (3.6, -1.4) {\faIcon{folder} \verb+dissemination+}; + \node (n4) at (4, -2.1) {\faIcon{folder} \verb+material+}; + \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (top.east) -- (n1.west); + \path (top.east) -- (n2.west); + \path (top.east) -- (n3.west); \path (top.east) -- (n4.west); + \path (top.east) -- (file.west); + % second level + \node (o1) at (8.5, 0) {\faIcon{folder} \verb+paper+}; + \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+talks+}; + \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+}; + \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+}; + \node (o5) at (8.5, -2.8) {\faIcon{folder} \verb+tables+}; + \path (n3.east) -- (o1.west); + \path (n3.east) -- (o2.west); + \path (n3.east) -- (o3.west); + \path (n3.east) -- (o4.west); + \path (n3.east) -- (o5.west); + \end{tikzpicture} + \vfill + \pause + Having separate folders for figures and tables helps you keep track of them + for your paper and talks +\end{frame} + +\begin{frame}[fragile]{Figures and tables} + \begin{itemize} + \item Most of us (including me!) 
are not at a stage where we are + writing our papers or talks as reproducible documents + \pause + \item It is still a good idea to create tables and figures in R and keep the + code easily accessible + \pause + \item One suggestion + + \begin{tikzpicture}[ + every node/.style = {text width = 4.2cm, align = left}, + every path/.style = {thick, draw} + ] + % figures + \node (fig) at (0, 0) {\faIcon{folder} \verb+figures+}; + \node (n1) at (4, 0) {\faIcon[regular]{file} \verb+h1_barplot.R+}; + \node (n2) at (4, -0.7) {\faIcon[regular]{file} \verb+h1_barplot.png+}; + \path (fig.center) -- (n1.west); + \path (fig.center) -- (n2.west); + % tables + \node (tab) at (0, -1.5) {\faIcon{folder} \verb+tables+}; + \node (o1) at (4, -1.5) {\faIcon[regular]{file} \verb+h1_mean-table.Rmd+}; + \node (o2) at (4, -2.2) {\faIcon[regular]{file} \verb+h1_mean-table.docx+}; + \path (tab.center) -- (o1.west); + \path (tab.center) -- (o2.west); + \end{tikzpicture} + \pause + \item I export the data for figures and tables from \texttt{analysis/code} + to \texttt{dissemination/results} so the dissemination folder is + self-contained + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Several data sources} + \begin{itemize} + \item When you have several different data sources like questionnaires and + eye-tracking data keep them in separate folders + \begin{tikzpicture}[ + every node/.style = {text width = 4cm, align = left}, + every path/.style = {thick, draw} + ] + \node (data) at (0, 0) {\faIcon{folder} \verb+data+}; + \node (n1) at (4, 0) {\faIcon{folder} \verb+eyetracking+}; + \node (n2) at (4, -0.7) {\faIcon{folder} \verb+qualtrics+}; + \path (data.center) -- (n1.west); + \path (data.center) -- (n2.west); + \end{tikzpicture} + \pause + \item Process them separately, e.\,g., with + \verb+01a_preprocessing_eyetracking.R+ and + \verb+01b_preprocessing_surveys.R+ and then \verb+02_combine-data.R+ + \begin{tikzpicture}[ + every node/.style = {text width = 5cm, align = left}, + every 
path/.style = {thick, draw} + ] + \node (results) at (0, 0) {\faIcon{folder} \verb+results+}; + \node (n1) at (4, 0) {\faIcon[regular]{file} \verb+data_eyetracking.csv+}; + \node (n2) at (4, -0.7) {\faIcon[regular]{file} \verb+data_surveys.csv+}; + \node (n3) at (4, -1.4) {\faIcon[regular]{file} \verb+data_complete.csv+}; + \path (results.center) -- (n1.west); + \path (results.center) -- (n2.west); + \path (results.center) -- (n3.west); + \end{tikzpicture} + \end{itemize} +\end{frame} + +\begin{frame}{Toy example with 11 questions} + Thank you everybody for filling out our little toy survey in Qualtrics! + \vfill + \tiny +\begin{tabular}{lllll} + \hline + ResponseId & age & sex & data\_sharing\_1 & data\_sharing\_2 \\ + \hline +R\_225ffqhb7qRaIGO:1 & Min. :24.00 & m : 2 & No :7 & Min. :1.000 \\ + R\_2F9fXxf3NedHqZl:1 & 1st Qu.:26.50 & f :11 & Yes:7 & 1st Qu.:1.000 \\ + R\_2foYj4iSgaBTkEO:1 & Median :28.00 & d : 1 & & Median :2.000 \\ + R\_2J9B4aLaasQ1m81:1 & Mean :29.86 & not indicated: 0 & & Mean :2.214 \\ + R\_2P1TMDNlwm0gSIk:1 & 3rd Qu.:30.00 & & & 3rd Qu.:2.000 \\ + R\_2pXfOSq8DBImG6R:1 & Max. :43.00 & & & Max. :6.000 \\ + (Other) :8 & & & & \\ + \hline +\end{tabular} + + \vspace{.5cm} +\begin{tabular}{lllllll} + \hline + rdm\_stmnt\_1 & rdm\_stmnt\_2 & rdm\_stmnt\_3 & rdm\_stmnt\_4 & rdm\_stmnt\_5 & career\_level\_1 & career\_level\_2 \\ + \hline +Min. :2.000 & Min. :2 & Min. :2.000 & Min. :1.000 & Min. :1.000 & Student : 0 & Min. : 1.000 \\ + 1st Qu.:3.250 & 1st Qu.:4 & 1st Qu.:2.250 & 1st Qu.:1.000 & 1st Qu.:1.000 & PhD student :11 & 1st Qu.: 1.625 \\ + Median :4.500 & Median :4 & Median :3.000 & Median :1.000 & Median :1.000 & Postdoc : 1 & Median : 2.500 \\ + Mean :4.071 & Mean :4 & Mean :2.857 & Mean :1.143 & Mean :1.143 & Senior researcher: 0 & Mean : 5.964 \\ + 3rd Qu.:5.000 & 3rd Qu.:5 & 3rd Qu.:3.000 & 3rd Qu.:1.000 & 3rd Qu.:1.000 & Professor : 1 & 3rd Qu.: 4.500 \\ + Max. :5.000 & Max. :5 & Max. :5.000 & Max. :2.000 & Max. :2.000 & Other : 1 & Max. 
:38.000 \\ + & NA's :1 & & & & & \\ + \hline +\end{tabular} +\end{frame} + +% print(xtable::xtable(summary(dat[, 1:5])), include.rownames = FALSE) +% print(xtable::xtable(summary(dat[, 6:12])), include.rownames = FALSE) + +\begin{frame}[fragile]{Folder structure for toy example} + {One possible structure!} + \begin{tikzpicture}[ + every node/.style = {text width = 4.3cm, align = left}, + every path/.style = {thick, draw} + ] + \node (ex) at (0, 0) {\faIcon{folder} \verb+example+}; + \node (n1) at (3, 0) {\faIcon{folder} \verb+code+}; + \node (n2) at (3, -0.7) {\faIcon{folder} \verb+data+}; + \node (n3) at (3, -1.4) {\faIcon[regular]{file} \verb+README.md+}; + \path (ex.center) -- (n1.west); + \path (ex.center) -- (n2.west); + \path (ex.center) -- (n3.west); + + \node (o1) at (7, 0.7) {\faIcon[regular]{file} \verb+01_preprocessing.R+}; + \node (o2) at (7, -0.7) {\faIcon{folder} \verb+codebook+}; + \node (o3) at (7, -1.4) {\faIcon{folder} \verb+rawdata+}; + \node (o4) at (7, -2.1) {\faIcon{folder} \verb+results+}; + \path (n1.center) -- (o1.west); + \path (n2.center) -- (o2.west); + \path (n2.center) -- (o3.west); + \path (n2.center) -- (o4.west); + + \node (p1) at (11, -0.7) {\faIcon[regular]{file} \verb+codebook_01.R+}; + \node (p2) at (11, -1.4) {\faIcon[regular]{file} \verb+codebook_01.xlsx+}; + \node (p3) at (11, -2.1) {\dots}; + + \path (o2.center) -- (p1.west); + \path (o2.center) -- (p2.west); + \path (o2.center) -- (p3.west); + \end{tikzpicture} + +\end{frame} + +\section{Metadata} + +\begin{frame}{Metadata answers questions} + \begin{itemize} + \item {\bf Who} created the data? + \item {\bf Why} was the data created? + \item {\bf When} was the data created? + \item {\bf Where} is the data? + \item {\bf How} was the data created? + \item {\bf What} is the content of the data? 
+ \end{itemize} + \vfill + \hfill{\tiny \citet{Wilbrandt2023}} +\end{frame} + +\begin{frame}{Metadata} + \begin{block}{Metadata} + \dots is data about data.\\ + \dots can be \emph{descriptive}, \emph{structural}, or \emph{administrative}. + \end{block} + \vfill + \begin{columns} + \begin{column}[t]{.5\textwidth} + Contains information on origin and background of data like + \begin{itemize} + \item Who, when, why, how, \dots + \item Used resources + \item Used abbreviations, units, names + \item Licenses + \item \dots + \end{itemize} + \end{column} + \begin{column}[t]{.5\textwidth} + Data can be anything like + \begin{itemize} + \item Book content + \item Pictures or audio files + \item Website content or a blog post + \item Journal paper + \item Research data + \item \dots + \end{itemize} + \end{column} + \end{columns} + \vfill +\end{frame} + +\begin{frame}{Metadata examples} +{Photo} + \begin{center} + \includegraphics[scale = .31]{../figures/metadata_photo} + \end{center} +\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}} +\end{frame} + +\begin{frame}{Metadata examples} +{Book} + \begin{center} + \includegraphics[scale = .36]{../figures/metadata_book} + \end{center} +\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}} +\end{frame} + +\begin{frame}{Metadata examples} +{Webpage} + \begin{center} + \includegraphics[scale = .27]{../figures/metadata_webpage} + \end{center} +\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}} +\end{frame} + +\begin{frame}{Metadata examples} +{WORD document} + \begin{center} + \includegraphics[scale = .23]{../figures/metadata_word_document} + \end{center} +\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}} +\end{frame} + +\begin{frame}{Metadata for research data} +\begin{tikzpicture} +\node[font=\Large] (n1) at (0,0) {\bf \color{iwmorange} Study}; + +\node[font=\large] (i1) at (0,-1) {$\bullet$ Persons}; +\node[font=\large] (i2) at (.36,-1.5) 
{$\bullet$ Background}; +\node[font=\large] (i3) at (.03,-2) {$\bullet$ Funding}; +\node[font=\large] (i4) at (-.38,-2.5) {$\bullet$ \dots}; +\node[draw=iwmorange, thick, fit={(n1) (i1) (i2) (i3) (i4)}, inner sep=10pt] (box) {}; + +\node[font=\Large] (n2) at (5,0) {\bf \color{iwmorange} Data set}; + +\node[font=\large] (j1) at (4.3,-1) {$\bullet$ Files}; +\node[font=\large] (j2) at (4.57,-1.5) {$\bullet$ Sources}; +\node[font=\large] (j3) at (4.65,-2) {$\bullet$ Methods}; +\node[font=\large] (j4) at (4.18,-2.5) {$\bullet$ \dots}; +\node[draw=iwmorange, thick, fit={(n2) (j1) (j2) (j3) (j4)}, inner sep=10pt] (box) {}; + +\node[font=\Large] (n3) at (10,0) {\bf \color{iwmorange} Variables}; + +\node[font=\large] (k1) at (9.7,-1) {$\bullet$ Data type}; +\node[font=\large] (k2) at (9.69,-1.5) {$\bullet$ Scale unit}; +\node[font=\large] (k3) at (9.85,-2) {$\bullet$ Value range}; +\node[font=\large] (k4) at (9.12,-2.5) {$\bullet$ \dots}; +\node[draw=iwmorange, thick, fit={(n3) (k1) (k2) (k3) (k4)}, inner sep=10pt] (box) {}; + +\draw[-latex, thick] (n1) -- (n2); +\draw[-latex, thick] (n2) -- (n3); +\end{tikzpicture} + \vfill + \hfill\tiny \url{https://datamanagement.hms.harvard.edu/collect/readme-files} +\end{frame} + +\section{README files} + +\begin{frame}{README files} + \begin{itemize} + \item Can be used to give information about all levels in a research + project: study/project, data set, variables; either in one README or in + several ones + \item Should provide a clear and concise description of all relevant details + about data collection, processing, and analysis + \item README files are created for different purposes: + \begin{itemize} + \item to document changes to files or file names within a folder + \item to explain file naming conventions, practices, etc.\ ``in + general'' for future reference + \item to specifically accompany files/data being deposited in a + repository + \end{itemize} + \item Creating a README file at the beginning of your research 
process, + and updating it consistently throughout your research, will help you + to compile a final README file when your data is ready for deposit + \item Find a template here: + \url{https://cornell.app.box.com/v/ReadmeTemplate} + \end{itemize} + \vfill + \hfill\tiny \url{https://datamanagement.hms.harvard.edu/collect/readme-files} +\end{frame} + +\begin{frame}{Study/project}{README on top level} + \begin{itemize} + \item Project name and purpose + \item Funding information (process number!) + \item Ethics approved? LEK number! + \item Person(s) responsible for study conduction + \item One or several studies? Infos about them + \item Time/Duration of project + \item \dots + \end{itemize} +\end{frame} + +\begin{frame}{Data set}{README accompanying data set(s)} + \begin{itemize} + \item One or more data sets? + \item Time of data collection + \item Person(s) responsible for data collection + \item File organisation + \item Naming conventions + \item Preprocessing methods + \item Anything that is special about the data set(s) + \item Number of subjects + \item Variables + \item \dots + \end{itemize} +\end{frame} + +\begin{frame}{Variables}{README accompanying a specific data set} + \begin{itemize} + \item You can use a README (or text file called \texttt{codebook.txt} or + similar) to document your variables + \item Especially, if you only have a few variables, this is an easy and fast + way to document them + \item If you are working with extensive surveys or questionnaires, it might + be a good time investment to create a more elaborate codebook + \end{itemize} + \vfill +\end{frame} + +\section{Codebooks} + +\begin{frame}{What information about variables should a codebook include?} + % slido + \centering + \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 3} + + \url{https://app.sli.do/event/3S1Bn3Tjknuk5J5WiqAYzG} +\end{frame} + +\begin{frame}{A codebook should include} + \begin{tabular}{lp{11cm}} + \hline + Variable name & 
Usually some abbreviation like \texttt{pna01} \\ + Variable label & Brief description to identify variable \\ + Question text & If applicable, exact wording from survey question \\ + Values & Values variable can take (e.\,g., 1 to 5) \\ + Value labels & If applicable, textual descriptions of the values \\ + Statistics & For example, range, mean, standard deviation for + numeric variables; frequencies and percentages for categorical variables \\ + Missing data & If applicable, values and labels of missing data \\ + Notes & Additional notes, remarks, or comments; for measures or + questions from copyrighted instruments, the notes field can be used to + cite the source \\ + \hline + \end{tabular} + \vfill + + \hfill\tiny \url{https://www.icpsr.umich.edu/web/ICPSR/cms/1983} +\end{frame} + +\begin{frame}{Codebooks} + \begin{itemize} + \item There are many different ways to create a codebook + \item It can be a README, some other plain text file, a table (stored as CSV + or XLSX), a WORD document, or PDF + \item For a short questionnaire, it can be sufficient to export it as a PDF + \item Let's walk through a couple of options\dots + \end{itemize} + \vfill +\end{frame} + +\begin{frame}{Option 1 -- Toy example with 11 questions} + {Simple PDF} + \begin{columns} + \begin{column}{.5\textwidth} + \begin{center} + \vspace{-.4cm} + Export from Qualtrics\\ + \includegraphics[scale = .3]{../figures/codebook_1.png} + \end{center} + \end{column} + \begin{column}{.6\textwidth} + \begin{itemize} + \item For a simple questionnaire like this, the exported WORD document + from Qualtrics exported to PDF might be sufficient as a codebook + \item For longer questionnaires, the WORD document can still be a good + starting point to create a more elaborate codebook + \end{itemize} + \end{column} + \end{columns} +\end{frame} + +\begin{frame}[fragile]{Option 2 -- Toy example with 11 questions} + {Plain text file} + \begin{center} + \vspace{-.3cm} + \footnotesize +\begin{lstlisting}[language = 
bash, identifierstyle=\color{iwmgray}] + sex. Please indicate your sex. + ------------------------------------------------------------------------------- + -1. m + -2. f + -3. d + -4. not indicated + + age. How old are you? Please enter your age in years. + ------------------------------------------------------------------------------- + numerical input + + data_sharing_1. Have you ever published data in a repository? + ------------------------------------------------------------------------------- + -1. No + -2. Yes +\end{lstlisting} + \end{center} +\end{frame} + +\begin{frame}[fragile]{Option 3 -- Toy example with 11 questions} + {Creating a simple codebook in R ``by hand''} +\footnotesize + \begin{lstlisting} +load("results/data_rdm-ms-ss2024_cleaned.RData") +codebook <- data.frame(var_name = names(dat), + var_text = c("Response Id", "Please indicate your sex.", + "How old are you? Please enter your age in years.", + ... + "Sharing data is bad scientific practice", + "What is your current career level?", + "How long have you been working in science (in years)?")) + +codebook$type <- sapply(dat, class) +codebook$n <- sapply(dat, length) +codebook$mean <- sapply(dat, + function(x) ifelse(is.numeric(x), mean(x, na.rm = TRUE), NA)) +codebook$sd <- sapply(dat, function(x) ifelse(is.numeric(x), sd(x), NA)) + +openxlsx::write.xlsx(codebook, file = "codebook/codebook_01.xlsx") + \end{lstlisting} +\end{frame} + +\begin{frame}[fragile]{Option 3 -- Toy example with 11 questions} + {Creating a simple codebook in R ``by hand''} + \begin{center} + \includegraphics[scale = .6]{../figures/codebook_2.png} + \end{center} +\end{frame} + +\begin{frame}[fragile]{Option 4 -- Toy example with 11 questions} + {Using the codebook package in R} + \begin{itemize} + \item When you export a qualtrics questionnaire as SPSS file and import it + into R using the haven package, you can use RMarkdown to create an + elaborate HTML codebook + \item It works best for classical questionnaire items + 
\item In our example, the survey is not formatted well enough for the + generated codebook to be completely correct + \end{itemize} + \footnotesize + \begin{lstlisting} +#' --- +#' title: Codebook for Data Set "RDM MS SS 2024" +#' author: Nora Wickelmaier +#' --- + +#+ echo = FALSE +dat <- haven::read_spss("../rawdata/RDM_MS_SS2024_download_2024-06-04.sav") +codebook::codebook(dat) + \end{lstlisting} +\end{frame} + +\appendix +%%\begin{frame}[allowframebreaks]{References} +\begin{frame}{References} +%\renewcommand{\bibfont}{\small} + \printbibliography +\vfill +\end{frame} + +\end{document} + diff --git a/03_data_organisation/example/README.md b/03_data_organisation/example/README.md new file mode 100644 index 0000000..02510ed --- /dev/null +++ b/03_data_organisation/example/README.md @@ -0,0 +1,47 @@ +# Toy data set for the methods seminar on data management SS2024 + +## Responsible person + +Nora Wickelmaier +Referentin Forschungsmethoden und Forschungsdatenmanagement +Leibniz-Institut für Wissensmedien (IWM) +n.wickelmaier@iwm-tuebingen.de + +## Folder structure and naming conventions + +``` +/example/ +| +|- /code/ +|- /data/ + |- /codebook/ + |- /rawdata/ + |- /results/ +``` + +The `code` folder contains analysis scripts written in R. The scripts are +numbered, indicating the order they should be executed in. + +The `data` folder contains all folders associated with data and its +documentation. + +The `codebook` folder contains different codebook options and R scripts that +create these codebooks. If the codebook is created by an R script, the script +and the codebook are named identically, e.g., `codebook_01.R` and +`codebook_01.xlsx`. + +The `rawdata` folder contains the downloads from Qualtrics. In Qualtrics, the +variables have been selected and ordered and then downloaded, without the +additional columns Qualtrics adds by default. The naming convention for the +downloaded files is +``` +RDM_MS_SS2024_download_<YYYY-MM-DD>.<ext> 
+``` +No other files than the downloads from Qualtrics should go into this folder! + +The `results` folder contains processed data. The scripts in `/code/` process +the data from `/rawdata/` and save the files containing the processed data to +`/results/`. Data can be exported as CSV files or RData files. If different file +formats contain the same data, they should be named identically, e.g., +`data_rdm-ms-ss2024_cleaned.csv` and `data_rdm-ms-ss2024_cleaned.RData`. + diff --git a/03_data_organisation/example/code/01_preprocessing.R b/03_data_organisation/example/code/01_preprocessing.R new file mode 100644 index 0000000..4072709 --- /dev/null +++ b/03_data_organisation/example/code/01_preprocessing.R @@ -0,0 +1,78 @@ +# 01_preprocessing.R +# +# Cleaning up data for toy data set Methods Seminar SS2024 +# +# Input: RDM_MS_SS2024_download_2024-06-07.csv +# Output: results/data_rdm-ms-ss2024_cleaned.csv +# results/data_rdm-ms-ss2024_cleaned.RData +# +# created: 2024-06-03 + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/teaching/iwm/data_management/03_data_organisation/example/") + +dat <- read.table("data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv", + sep = ",", skip = 3, stringsAsFactors = TRUE, na.string = "") + +names(dat) <- + readLines("data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv", 1) |> + strsplit(split = ",") |> + unlist() + +# Clean up variables +dat$ResponseId <- factor(dat$ResponseId) + +dat$sex <- factor(dat$sex, + levels = c("m", "f", "d", "not indicated")) + +dat$data_sharing_1 <- factor(dat$data_sharing_1, + levels = c("No", "Yes")) + +dat$career_level_1 <- factor(dat$career_level_1, + levels = c("Student", "PhD student", "Postdoc", + "Senior researcher", "Professor", + "Other")) + +dat$rdm_stmnt_1 <- factor(dat$rdm_stmnt_1, + levels = c("Strongly disagree", "Disagree", + "Neither agree nor disagree", "Agree", + "Strongly agree")) + +dat$rdm_stmnt_2 <- factor(dat$rdm_stmnt_2, + levels = c("Strongly disagree", "Disagree", + "Neither agree
nor disagree", "Agree", + "Strongly agree")) + +dat$rdm_stmnt_3 <- factor(dat$rdm_stmnt_3, + levels = c("Strongly disagree", "Disagree", + "Neither agree nor disagree", "Agree", + "Strongly agree")) + +dat$rdm_stmnt_4 <- factor(dat$rdm_stmnt_4, + levels = c("Strongly disagree", "Disagree", + "Neither agree nor disagree", "Agree", + "Strongly agree")) + +dat$rdm_stmnt_5 <- factor(dat$rdm_stmnt_5, + levels = c("Strongly disagree", "Disagree", + "Neither agree nor disagree", "Agree", + "Strongly agree")) + +## Fix data_sharing_2 +# The column is read as a factor (stringsAsFactors = TRUE). Convert it to +# character first: as.numeric() on a factor returns the internal level codes, +# not the numbers that were entered. +dat$data_sharing_2 <- as.character(dat$data_sharing_2) +dat$data_sharing_2[dat$data_sharing_2 == "1 out of 4"] <- "1" +dat$data_sharing_2 <- as.numeric(dat$data_sharing_2) + +# Create numeric statement variables +# (as.numeric() on these factors yields the position of the level, i.e., +# 1 = "Strongly disagree" to 5 = "Strongly agree"; each item is converted from +# its own column) + +dat$rdm_stmnt_1 <- as.numeric(dat$rdm_stmnt_1) +dat$rdm_stmnt_2 <- as.numeric(dat$rdm_stmnt_2) +dat$rdm_stmnt_3 <- as.numeric(dat$rdm_stmnt_3) +dat$rdm_stmnt_4 <- as.numeric(dat$rdm_stmnt_4) +dat$rdm_stmnt_5 <- as.numeric(dat$rdm_stmnt_5) + +# Save cleaned data set +write.table(dat, file = "data/results/data_rdm-ms-ss2024_cleaned.csv", sep = ";", + row.names = FALSE, quote = FALSE) + +save(dat, file = "data/results/data_rdm-ms-ss2024_cleaned.RData") + diff --git a/03_data_organisation/example/data/codebook/RDM_MS_SS2024.docx b/03_data_organisation/example/data/codebook/RDM_MS_SS2024.docx new file mode 100644 index 0000000..3997600 Binary files /dev/null and b/03_data_organisation/example/data/codebook/RDM_MS_SS2024.docx differ diff --git a/03_data_organisation/example/data/codebook/codebook_01.R b/03_data_organisation/example/data/codebook/codebook_01.R new file mode 100644 index 0000000..085d470 --- /dev/null +++ b/03_data_organisation/example/data/codebook/codebook_01.R @@ -0,0 +1,41 @@ +# codebook_01.R +# +# Codebook generation example +# +# Input: data/results/data_rdm-ms-ss2024_cleaned.RData +# Output: data/codebook/codebook_01.csv +# data/codebook/codebook_01.xlsx +# +# created: 2024-06-04 + +#
setwd("C:/Users/nwickelmaier/Nextcloud/Documents/teaching/iwm/data_management/03_data_organisation/example/") + +load("data/results/data_rdm-ms-ss2024_cleaned.RData") + +# Variable names are taken from the data; the question texts in var_text must +# be listed in the same order as the columns of dat (ResponseId, age, sex, ...) +codebook <- data.frame(var_name = names(dat), + var_text = c("Response Id", + "How old are you? Please enter your age in years.", + "Please indicate your sex.", + "Have you ever published data in a repository?", + "How many of your data sets have you published so far?", + "All my analyses are preregistered", + "Sharing my data is very important to me", + "I invest more time in research data management than my colleagues", + "I think research data management is overrated", + "Sharing data is bad scientific practice", + "What is your current career level?", + "How long have you been working in science (in years)?") + +) + +# Descriptive statistics; n is the length of each column, i.e., missing values +# are included +codebook$type <- sapply(dat, class) +codebook$n <- sapply(dat, length) +codebook$mean <- sapply(dat, function(x) ifelse(is.numeric(x), mean(x, na.rm = TRUE), NA)) +# na.rm = TRUE as for the mean; otherwise sd is NA as soon as a variable has a +# missing value +codebook$sd <- sapply(dat, function(x) ifelse(is.numeric(x), sd(x, na.rm = TRUE), NA)) + +# Write without row names so that the header line has as many fields as the +# data lines +write.table(codebook, + file = "data/codebook/codebook_01.csv", + na = "", + sep = ";", + quote = FALSE, + row.names = FALSE) + +openxlsx::write.xlsx(codebook, file = "data/codebook/codebook_01.xlsx") + diff --git a/03_data_organisation/example/data/codebook/codebook_01.csv b/03_data_organisation/example/data/codebook/codebook_01.csv new file mode 100644 index 0000000..3ad5ba3 --- /dev/null +++ b/03_data_organisation/example/data/codebook/codebook_01.csv @@ -0,0 +1,13 @@ +var_name;var_text;type;n;mean;sd +1;ResponseId;Response Id;factor;13;; +2;age;Please indicate your sex.;integer;13;29.6923076923077;5.99144689515278 +3;sex;How old are you?
Please enter your age in years.;factor;13;; +4;data_sharing_1;Have you ever published data in a repository?;factor;13;; +5;data_sharing_2;How many of your data sets have you published so far?;numeric;13;2.30769230769231;1.65250392761083 +6;rdm_stmnt_1;All my analyses are preregistered;numeric;13;4.15384615384615;1.14354374979373 +7;rdm_stmnt_2;Sharing my data is very important to me;numeric;13;4; +8;rdm_stmnt_3;I invest more time in research data management than my colleagues;numeric;13;2.84615384615385;0.800640769025436 +9;rdm_stmnt_4;I think research data management is overrated;numeric;13;1.15384615384615;0.375533808099405 +10;rdm_stmnt_5;Sharing data is bad scientific practice;numeric;13;1.15384615384615;0.375533808099405 +11;career_level_1;What is your current career level?;factor;13;; +12;career_level_2;How long have you been working in science (in years)?;numeric;13;6.26923076923077;10.1788493632126 diff --git a/03_data_organisation/example/data/codebook/codebook_01.xlsx b/03_data_organisation/example/data/codebook/codebook_01.xlsx new file mode 100644 index 0000000..9b75665 Binary files /dev/null and b/03_data_organisation/example/data/codebook/codebook_01.xlsx differ diff --git a/03_data_organisation/example/data/codebook/codebook_02.R b/03_data_organisation/example/data/codebook/codebook_02.R new file mode 100644 index 0000000..54eb5de --- /dev/null +++ b/03_data_organisation/example/data/codebook/codebook_02.R @@ -0,0 +1,68 @@ +# codebook_02.R +# +# Codebook generation example using the variable labels of the SPSS export +# +# Input: data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav +# Output: data/codebook/codebook_02.csv +# data/codebook/codebook_02.xlsx +# +# created: 2024-06-04 + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/teaching/iwm/data_management/03_data_organisation/example/") + +dat <- as.data.frame(haven::read_spss("data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav")) + +## Fix data_sharing_2 +dat$data_sharing_2[dat$data_sharing_2 == "1 out of 4"] <- 1 + +# Look at attributes +attributes(dat$sex) + +# Create codebook with survey questions
+codebook <- data.frame(variable = names(dat), + label = sapply(dat, function(x) attr(x, "label"))) + +# Clean up data frame +dat <- as.data.frame(lapply(dat, sjlabelled::unlabel)) + +sapply(dat, class) # Look at classes + +dat$age <- as.numeric(dat$age) +dat$career_level_2 <- as.numeric(dat$career_level_2) +dat$data_sharing_2 <- as.numeric(dat$data_sharing_2) + +dat$sex <- factor(dat$sex, + levels = 1:4, + labels = names(attr(dat$sex, "labels"))) +dat$data_sharing_1 <- factor(dat$data_sharing_1, + levels = 1:2, + labels = names(attr(dat$data_sharing_1, "labels"))) +dat$career_level_1 <- factor(dat$career_level_1, + levels = 1:6, + labels = names(attr(dat$career_level_1, "labels"))) + +# Add descriptive statistics to codebook +# (n is the length of each column, i.e., missing values are included) + +codebook$n <- sapply(dat, length) +codebook$type <- sapply(dat, class) +codebook$mean <- sapply(dat, function(x) ifelse(is.numeric(x), mean(x, na.rm = TRUE), NA)) +# na.rm = TRUE as for the mean; otherwise sd is NA as soon as a variable has a +# missing value +codebook$sd <- sapply(dat, function(x) ifelse(is.numeric(x), sd(x, na.rm = TRUE), NA)) + +# props <- function(x) { +# if (is.factor(x)) { +# proportions(summary(x)) +# } else { +# NA +# } +# } +# +# codebook$prop <- lapply(dat, props) + +# Write without row names: they only repeat the variable column and leave the +# header line one field short of the data lines +write.table(codebook, + file = "data/codebook/codebook_02.csv", + na = "", + sep = ";", + quote = FALSE, + row.names = FALSE) + +openxlsx::write.xlsx(codebook, file = "data/codebook/codebook_02.xlsx") + + diff --git a/03_data_organisation/example/data/codebook/codebook_02.csv b/03_data_organisation/example/data/codebook/codebook_02.csv new file mode 100644 index 0000000..b927c88 --- /dev/null +++ b/03_data_organisation/example/data/codebook/codebook_02.csv @@ -0,0 +1,13 @@ +variable;label;n;type;mean;sd +ResponseId;ResponseId;Response ID;13;character;; +age;age;How old are you?
Please enter your age in years.;13;numeric;29.6923076923077;5.99144689515278 +sex;sex;Please indicate your sex.;13;factor;; +data_sharing_1;data_sharing_1;Have you ever published data in a repository?;13;factor;; +data_sharing_2;data_sharing_2;How many of your data sets have you published so far?;13;numeric;1.38461538461538;1.85015591858549 +rdm_stmnt_1;rdm_stmnt_1;Please indicate how much you agree with the following statements - All my analyses are preregistered;13;numeric;4.15384615384615;1.14354374979373 +rdm_stmnt_2;rdm_stmnt_2;Please indicate how much you agree with the following statements - Sharing my data is very important to me;13;numeric;4; +rdm_stmnt_3;rdm_stmnt_3;Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues;13;numeric;2.84615384615385;0.800640769025436 +rdm_stmnt_4;rdm_stmnt_4;Please indicate how much you agree with the following statements - I think research data management is overrated;13;numeric;1.84615384615385;0.987096233585649 +rdm_stmnt_5;rdm_stmnt_5;Please indicate how much you agree with the following statements - Sharing data is bad scientific practice;13;numeric;1.15384615384615;0.375533808099405 +career_level_1;career_level_1;What is your current career level?;13;factor;; +career_level_2;career_level_2;How long have you been working in science (in years)?;13;numeric;6.26923076923077;10.1788493632126 diff --git a/03_data_organisation/example/data/codebook/codebook_02.xlsx b/03_data_organisation/example/data/codebook/codebook_02.xlsx new file mode 100644 index 0000000..1d85ef4 Binary files /dev/null and b/03_data_organisation/example/data/codebook/codebook_02.xlsx differ diff --git a/03_data_organisation/example/data/codebook/codebook_03.R b/03_data_organisation/example/data/codebook/codebook_03.R new file mode 100644 index 0000000..c4dd722 --- /dev/null +++ b/03_data_organisation/example/data/codebook/codebook_03.R @@ -0,0 +1,10 @@ +#' --- +#' title: 
Codebook for Data Set "RDM MS SS 2024" +#' author: Nora Wickelmaier +#' --- + +#+ echo = FALSE +dat <- haven::read_spss("../rawdata/RDM_MS_SS2024_download_2024-06-04.sav") + +codebook::codebook(dat) + diff --git a/03_data_organisation/example/data/codebook/codebook_manual.txt b/03_data_organisation/example/data/codebook/codebook_manual.txt new file mode 100644 index 0000000..af58e39 --- /dev/null +++ b/03_data_organisation/example/data/codebook/codebook_manual.txt @@ -0,0 +1,94 @@ +############################################################################### +This file contains an overview of the variables from a toy data set collected +at the methods seminar SS 2024. The raw data contained in +"RDM_MS_SS2024_download_2024-06-03_v1.csv" contain additional variables +created by Qualtrics. The variables have been preprocessed and are stored in +"data_rdm-ms-ss2024_cleaned.csv". +############################################################################### + + +ResponseId. +------------------------------------------------------------------------------- +random sequence of numbers, letters, and underscore + + +sex. Please indicate your sex. +------------------------------------------------------------------------------- +-1. m +-2. f +-3. d +-4. not indicated + + +age. How old are you? Please enter your age in years. +------------------------------------------------------------------------------- +numerical input + + +data_sharing_1. Have you ever published data in a repository? +------------------------------------------------------------------------------- +-1. No +-2. Yes + + +data_sharing_2. How many of your data sets have you published so far? +------------------------------------------------------------------------------- +numerical input + + +rdm_stmnt. Please indicate how much you agree with the following statements: + +rdm_stmnt_1. All my analyses are preregistered +------------------------------------------------------------------------------- +-1.
Strongly disagree +-2. Disagree +-3. Neither agree nor disagree +-4. Agree +-5. Strongly agree + +rdm_stmnt_2. Sharing my data is very important to me +------------------------------------------------------------------------------- +-1. Strongly disagree +-2. Disagree +-3. Neither agree nor disagree +-4. Agree +-5. Strongly agree + +rdm_stmnt_3. I invest more time in research data management than my colleagues +------------------------------------------------------------------------------- +-1. Strongly disagree +-2. Disagree +-3. Neither agree nor disagree +-4. Agree +-5. Strongly agree + +rdm_stmnt_4. I think research data management is overrated +------------------------------------------------------------------------------- +-1. Strongly disagree +-2. Disagree +-3. Neither agree nor disagree +-4. Agree +-5. Strongly agree + +rdm_stmnt_5. Sharing data is bad scientific practice +------------------------------------------------------------------------------- +-1. Strongly disagree +-2. Disagree +-3. Neither agree nor disagree +-4. Agree +-5. Strongly agree + + +career_level_1. What is your current career level? +------------------------------------------------------------------------------- +-1. Student +-2. PhD student +-3. Postdoc +-4. Senior researcher +-5. Professor +-6. Other + + +career_level_2. How long have you been working in science (in years)? +------------------------------------------------------------------------------- +numerical input diff --git a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.csv b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.csv new file mode 100644 index 0000000..dcaa89e --- /dev/null +++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.csv @@ -0,0 +1,16 @@ +ResponseId,age,sex,data_sharing_1,data_sharing_2,rdm_stmnt_1,rdm_stmnt_2,rdm_stmnt_3,rdm_stmnt_4,rdm_stmnt_5,career_level_1,career_level_2 +Response ID,How old are you?
Please enter your age in years.,Please indicate your sex.,Have you ever published data in a repository?,How many of your data sets have you published so far?,Please indicate how much you agree with the following statements - All my analyses are preregistered,Please indicate how much you agree with the following statements - Sharing my data is very important to me,Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues,Please indicate how much you agree with the following statements - I think research data management is overrated,Please indicate how much you agree with the following statements - Sharing data is bad scientific practice,What is your current career level?,How long have you been working in science (in years)? +"{""ImportId"":""_recordId""}","{""ImportId"":""QID3_TEXT""}","{""ImportId"":""QID1""}","{""ImportId"":""QID4""}","{""ImportId"":""QID7_TEXT""}","{""ImportId"":""QID2_1""}","{""ImportId"":""QID2_2""}","{""ImportId"":""QID2_3""}","{""ImportId"":""QID2_4""}","{""ImportId"":""QID2_5""}","{""ImportId"":""QID8""}","{""ImportId"":""QID9_TEXT""}" +R_8q7OpSkcuPT7SbI,42,f,No,1,Neither agree nor disagree,Agree,Strongly agree,Strongly disagree,Strongly disagree,Other,14 +R_8Io4pbk0A1a37VL,28,f,Yes,1,Strongly agree,,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,1 +R_2J9B4aLaasQ1m81,28,f,Yes,1 out of 4,Strongly agree,Strongly agree,Disagree,Disagree,Strongly disagree,PhD student,3 +R_80kqWr3W48SgiUZ,43,f,Yes,6,Agree,Agree,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,3 +R_8QpI8T0rjTjaPPr,30,f,Yes,4,Strongly agree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,5 +R_8QoVv6THz1Qjtuz,28,f,Yes,1,Disagree,Disagree,Disagree,Agree,Strongly disagree,Professor,38 +R_2F9fXxf3NedHqZl,25,d,No,0,Agree,Strongly agree,Disagree,Neither agree nor disagree,Disagree,PhD student,2 +R_2foYj4iSgaBTkEO,24,f,No,0,Strongly agree,Strongly 
agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1 +R_83T6Oak5vI6GNJ7,30,f,Yes,1,Strongly agree,Agree,Neither agree nor disagree,Neither agree nor disagree,Strongly disagree,Postdoc,7 +R_2Vz26rWsOLYwqnD,25,m,Yes,3,Agree,Agree,Neither agree nor disagree,Disagree,Disagree,PhD student,2 +R_8HcBgUUm1BXFfhv,29,m,No,0,Strongly agree,Disagree,Disagree,Strongly disagree,Strongly disagree,PhD student,3 +R_2P1TMDNlwm0gSIk,26,f,No,0,Disagree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1.5 +R_225ffqhb7qRaIGO,28,f,No,0,Strongly agree,Strongly agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1 diff --git a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav new file mode 100644 index 0000000..364802a Binary files /dev/null and b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav differ diff --git a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sps b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sps new file mode 100644 index 0000000..4a91fdf --- /dev/null +++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sps @@ -0,0 +1,66 @@ +* Encoding: UTF-8. +TITLE "RDM_MS_SS2024". +SUBTITLE "". +VARIABLE LABELS +ResponseId "Response ID" +age "How old are you? Please enter your age in years." +sex "Please indicate your sex." +data_sharing_1 "Have you ever published data in a repository?" +data_sharing_2 "How many of your data sets have you published so far?" 
+rdm_stmnt_1 "Please indicate how much you agree with the following statements - All my analyses are preregistered" +rdm_stmnt_2 "Please indicate how much you agree with the following statements - Sharing my data is very important to me" +rdm_stmnt_3 "Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues" +rdm_stmnt_4 "Please indicate how much you agree with the following statements - I think research data management is overrated" +rdm_stmnt_5 "Please indicate how much you agree with the following statements - Sharing data is bad scientific practice" +career_level_1 "What is your current career level?" +career_level_2 "How long have you been working in science (in years)?" +. +VALUE LABELS + /sex + 1 "m" + 2 "f" + 3 "d" + 4 "not indicated" + /data_sharing_1 + 1 "No" + 2 "Yes" + /rdm_stmnt_1 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_2 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_3 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_4 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_5 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /career_level_1 + 1 "Student" + 2 "PhD student" + 3 "Postdoc" + 4 "Senior researcher" + 5 "Professor" + 6 "Other" +. +CACHE. +EXECUTE. 
diff --git a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv new file mode 100644 index 0000000..e0fef0f --- /dev/null +++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv @@ -0,0 +1,17 @@ +ResponseId,age,sex,data_sharing_1,data_sharing_2,rdm_stmnt_1,rdm_stmnt_2,rdm_stmnt_3,rdm_stmnt_4,rdm_stmnt_5,career_level_1,career_level_2 +Response ID,How old are you? Please enter your age in years.,Please indicate your sex.,Have you ever published data in a repository?,How many of your data sets have you published so far?,Please indicate how much you agree with the following statements - All my analyses are preregistered,Please indicate how much you agree with the following statements - Sharing my data is very important to me,Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues,Please indicate how much you agree with the following statements - I think research data management is overrated,Please indicate how much you agree with the following statements - Sharing data is bad scientific practice,What is your current career level?,How long have you been working in science (in years)? 
+"{""ImportId"":""_recordId""}","{""ImportId"":""QID3_TEXT""}","{""ImportId"":""QID1""}","{""ImportId"":""QID4""}","{""ImportId"":""QID7_TEXT""}","{""ImportId"":""QID2_1""}","{""ImportId"":""QID2_2""}","{""ImportId"":""QID2_3""}","{""ImportId"":""QID2_4""}","{""ImportId"":""QID2_5""}","{""ImportId"":""QID8""}","{""ImportId"":""QID9_TEXT""}" +R_8q7OpSkcuPT7SbI,42,f,No,1,Neither agree nor disagree,Agree,Strongly agree,Strongly disagree,Strongly disagree,Other,14 +R_8Io4pbk0A1a37VL,28,f,Yes,1,Strongly agree,,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,1 +R_2J9B4aLaasQ1m81,28,f,Yes,1 out of 4,Strongly agree,Strongly agree,Disagree,Disagree,Strongly disagree,PhD student,3 +R_80kqWr3W48SgiUZ,43,f,Yes,6,Agree,Agree,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,3 +R_8QpI8T0rjTjaPPr,30,f,Yes,4,Strongly agree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,5 +R_8QoVv6THz1Qjtuz,28,f,Yes,1,Disagree,Disagree,Disagree,Agree,Strongly disagree,Professor,38 +R_2F9fXxf3NedHqZl,25,d,No,0,Agree,Strongly agree,Disagree,Neither agree nor disagree,Disagree,PhD student,2 +R_2foYj4iSgaBTkEO,24,f,No,0,Strongly agree,Strongly agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1 +R_83T6Oak5vI6GNJ7,30,f,Yes,1,Strongly agree,Agree,Neither agree nor disagree,Neither agree nor disagree,Strongly disagree,Postdoc,7 +R_2Vz26rWsOLYwqnD,25,m,Yes,3,Agree,Agree,Neither agree nor disagree,Disagree,Disagree,PhD student,2 +R_8HcBgUUm1BXFfhv,29,m,No,0,Strongly agree,Disagree,Disagree,Strongly disagree,Strongly disagree,PhD student,3 +R_2P1TMDNlwm0gSIk,26,f,No,0,Disagree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1.5 +R_225ffqhb7qRaIGO,28,f,No,0,Strongly agree,Strongly agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1 +R_2pXfOSq8DBImG6R,32,f,No,0,Neither agree nor disagree,Agree,Neither agree nor disagree,Strongly disagree,Strongly 
disagree,PhD student,2 diff --git a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sav b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sav new file mode 100644 index 0000000..fa5e00b Binary files /dev/null and b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sav differ diff --git a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sps b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sps new file mode 100644 index 0000000..4a91fdf --- /dev/null +++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sps @@ -0,0 +1,66 @@ +* Encoding: UTF-8. +TITLE "RDM_MS_SS2024". +SUBTITLE "". +VARIABLE LABELS +ResponseId "Response ID" +age "How old are you? Please enter your age in years." +sex "Please indicate your sex." +data_sharing_1 "Have you ever published data in a repository?" +data_sharing_2 "How many of your data sets have you published so far?" +rdm_stmnt_1 "Please indicate how much you agree with the following statements - All my analyses are preregistered" +rdm_stmnt_2 "Please indicate how much you agree with the following statements - Sharing my data is very important to me" +rdm_stmnt_3 "Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues" +rdm_stmnt_4 "Please indicate how much you agree with the following statements - I think research data management is overrated" +rdm_stmnt_5 "Please indicate how much you agree with the following statements - Sharing data is bad scientific practice" +career_level_1 "What is your current career level?" +career_level_2 "How long have you been working in science (in years)?" +. 
+VALUE LABELS + /sex + 1 "m" + 2 "f" + 3 "d" + 4 "not indicated" + /data_sharing_1 + 1 "No" + 2 "Yes" + /rdm_stmnt_1 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_2 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_3 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_4 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /rdm_stmnt_5 + 1 "Strongly disagree" + 2 "Disagree" + 3 "Neither agree nor disagree" + 4 "Agree" + 5 "Strongly agree" + /career_level_1 + 1 "Student" + 2 "PhD student" + 3 "Postdoc" + 4 "Senior researcher" + 5 "Professor" + 6 "Other" +. +CACHE. +EXECUTE. diff --git a/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.RData b/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.RData new file mode 100644 index 0000000..bc0fbd9 Binary files /dev/null and b/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.RData differ diff --git a/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.csv b/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.csv new file mode 100644 index 0000000..58b1ca2 --- /dev/null +++ b/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.csv @@ -0,0 +1,15 @@ +ResponseId;age;sex;data_sharing_1;data_sharing_2;rdm_stmnt_1;rdm_stmnt_2;rdm_stmnt_3;rdm_stmnt_4;rdm_stmnt_5;career_level_1;career_level_2 +R_8q7OpSkcuPT7SbI;42;f;No;2;3;4;5;1;1;Other;14 +R_8Io4pbk0A1a37VL;28;f;Yes;2;5;NA;3;1;1;PhD student;1 +R_2J9B4aLaasQ1m81;28;f;Yes;2;5;5;2;1;1;PhD student;3 +R_80kqWr3W48SgiUZ;43;f;Yes;6;4;4;3;1;1;PhD student;3 +R_8QpI8T0rjTjaPPr;30;f;Yes;5;5;4;3;1;1;PhD student;5 +R_8QoVv6THz1Qjtuz;28;f;Yes;2;2;2;2;1;1;Professor;38 +R_2F9fXxf3NedHqZl;25;d;No;1;4;5;2;2;2;PhD student;2 
+R_2foYj4iSgaBTkEO;24;f;No;1;5;5;3;1;1;PhD student;1 +R_83T6Oak5vI6GNJ7;30;f;Yes;2;5;4;3;1;1;Postdoc;7 +R_2Vz26rWsOLYwqnD;25;m;Yes;4;4;4;3;2;2;PhD student;2 +R_8HcBgUUm1BXFfhv;29;m;No;1;5;2;2;1;1;PhD student;3 +R_2P1TMDNlwm0gSIk;26;f;No;1;2;4;3;1;1;PhD student;1.5 +R_225ffqhb7qRaIGO;28;f;No;1;5;5;3;1;1;PhD student;1 +R_2pXfOSq8DBImG6R;32;f;No;1;3;4;3;1;1;PhD student;2 diff --git a/figures/QR Code for Methodenseminar SS 2024 - Session 3.png b/figures/QR Code for Methodenseminar SS 2024 - Session 3.png new file mode 100644 index 0000000..5a55340 Binary files /dev/null and b/figures/QR Code for Methodenseminar SS 2024 - Session 3.png differ diff --git a/figures/codebook_1.png b/figures/codebook_1.png new file mode 100644 index 0000000..011e608 Binary files /dev/null and b/figures/codebook_1.png differ diff --git a/figures/codebook_2.png b/figures/codebook_2.png new file mode 100644 index 0000000..2a91bf6 Binary files /dev/null and b/figures/codebook_2.png differ diff --git a/figures/email_data_request_2024_03.png b/figures/email_data_request_2024_03.png new file mode 100644 index 0000000..f81490b Binary files /dev/null and b/figures/email_data_request_2024_03.png differ diff --git a/figures/email_data_request_2024_04.png b/figures/email_data_request_2024_04.png new file mode 100644 index 0000000..f5c3cf3 Binary files /dev/null and b/figures/email_data_request_2024_04.png differ