Slides and example for third session

2024-06-07 13:47:03 +02:00 · 2024-06-07 13:47:03 +02:00 · f1f7f35988
commit f1f7f35988
parent 102834032c
25 changed files with 1388 additions and 0 deletions
--- a/03_data_organisation/03_data_organisation.tex
+++ b/03_data_organisation/03_data_organisation.tex
@ -0,0 +1,844 @@
+\documentclass[aspectratio=169]{beamer}
+
+\usepackage{listings}
+% Source files are UTF-8; declare exactly one input encoding (a trailing
+% latin1 option would override utf8).
+\usepackage[utf8]{inputenc}
+\usepackage[style = apa, backend = biber, natbib = true]{biblatex}
+\addbibresource{../literature/lit.bib}
+
+\usepackage{fancyvrb}
+\usepackage{fontawesome5}                % get icons
+\usepackage{multirow}
+\usepackage{color, colortbl}
+
+\usepackage{tikz}
+\usetikzlibrary{fit}
+\usepackage[edges]{forest}
+
+% Global defaults for all listings: R syntax highlighting in the iwm* colours
+% (defined further down in the preamble), no frame and no line numbers.
+\lstset{language=R,%
+  backgroundcolor=\color{iwmgray!15!white},
+  basicstyle=\ttfamily\color{iwmgray},
+  frame=none,
+  commentstyle=\slshape\color{iwmgreen},
+  keywordstyle=\bfseries\color{iwmgray},
+  identifierstyle=\color{iwmpurple},
+  stringstyle=\color{iwmblue},
+  numbers=none,%left,numberstyle=\tiny,
+  basewidth={.5em, .4em},
+  showstringspaces=false,
+  emphstyle=\color{red!50!white}}
+
+\makeatletter \def\newblock{\beamer@newblock} \makeatother
+
+\beamertemplatenavigationsymbolsempty
+\setbeamertemplate{itemize items}[circle]
+\setbeamertemplate{section in toc}[circle]
+\mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
+\setbeamercolor{block body}{bg=iwmorange!50!white}
+\setbeamercolor{block title}{fg=white, bg=iwmorange}
+% Definitions for biblatex
+\setbeamercolor{bibliography entry note}{fg=iwmgray}
+\setbeamercolor{bibliography entry author}{fg=iwmgray}
+\setbeamertemplate{bibliography item}{}
+
+\definecolor{iwmorange}{RGB}{255,105,0}
+\definecolor{iwmgray}{RGB}{67,79,79}
+\definecolor{iwmblue}{RGB}{60,180,220}
+\definecolor{iwmgreen}{RGB}{145,200,110}
+\definecolor{iwmpurple}{RGB}{120,0,75}
+
+\setbeamercolor{title}{fg=iwmorange}
+\setbeamercolor{frametitle}{fg=iwmorange}
+\setbeamercolor{structure}{fg=iwmorange}
+\setbeamercolor{normal text}{fg=iwmgray}
+\setbeamercolor{author}{fg=iwmgray}
+\setbeamercolor{date}{fg=iwmgray}
+
+% Math shorthands for vectors and matrices: \vect and \mat set their argument
+% with \mathbf, \gvect and \gmat with \boldsymbol (the g prefix presumably
+% stands for Greek symbols, which \mathbf does not embolden).
+\newcommand{\vect}[1]{\mathbf{#1}}
+\newcommand{\mat}[1]{\mathbf{#1}}
+\newcommand{\gvect}[1]{\boldsymbol{#1}}
+\newcommand{\gmat}[1]{\boldsymbol{#1}}
+
+% Insert an outline frame at the start of every section: the current section
+% is shown and the other sections are hidden; subsections are shown only for
+% the current section.
+\AtBeginSection[]{
+  \frame{
+    \tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}}
+
+% Headline: horizontal section navigation bar spanning the full paper width.
+\setbeamertemplate{headline}{
+ \begin{beamercolorbox}{section in head}
+   \vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
+ \end{beamercolorbox}
+}
+
+% Footline: only the frame number, pushed to the right edge.
+\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
+
+\title{Data organisation for effective research data management}
+\author{Nora Wickelmaier}
+\date{June 10, 2024}
+
+\begin{document}
+
+\begin{frame}{}
+\thispagestyle{empty}
+\titlepage
+\end{frame}
+
+\begin{frame}{Data request}
+  \begin{center}
+    \includegraphics[scale = .55]{../figures/email_data_request_2024_01}
+  \end{center}
+\end{frame}
+
+\begin{frame}{Data folder for the data requested}
+  \begin{center}
+    \includegraphics[scale = .6]{../figures/email_data_request_2024_03}
+  \end{center}
+\end{frame}
+
+\begin{frame}{What is bad about this data organisation?}
+  % slido
+  \centering
+  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 3}
+
+  \url{https://app.sli.do/event/3S1Bn3Tjknuk5J5WiqAYzG}
+\end{frame}
+
+\begin{frame}[<+->]{Bad things about this data organisation}
+  \begin{itemize}
+    \item Raw and processed data are in the same folder
+    \item File naming does not sort in a sensible way: Best order would be first
+      by subject, then by session
+    \item Data and data scripts are in the same folder
+    \item Data scripts are not numbered, unclear in what order they need to be
+      executed
+    \item There are plot files (PDFs) between the data and code files
+    \item It is unclear which are the final and processed data files
+    \item The final data files are not stored in an interoperable format: There
+      is only an \texttt{.RData} file that (probably) contains the final data
+      which was used for further analyses
+    \item There is no documentation whatsoever
+    \item \dots
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Topics for this semester}
+\centering
+% Semester schedule; today's session turns bold on the second slide via
+% beamer's overlay-aware \textbf.
+\begin{tabular}{ll}
+\hline
+Date & Topic \\
+\hline
+2024-05-13 & Introduction to data management \\
+2024-05-27 & Workflow \\
+\textbf<2>{2024-06-10} & \textbf<2>{Data organisation} \\
+2024-06-24 & Data sharing                    \\
+2024-07-08 & Clean coding                    \\
+2024-07-22 & Version control                 \\
+\hline
+\end{tabular}
+\end{frame}
+
+% * different data sources
+% * content README file
+% * best arrangement of data
+% * redundancy
+% * anonymizing/pseudonymizing data
+
+\section{Folder organisation}
+
+\begin{frame}[<+->]{Some general rules}
+  \begin{itemize}
+    \item One project, one folder
+    \item Add README file at top level
+    \item Raw data are in a separate folder (and stay separate!)
+    \item Have a code folder
+    \item It is often a good idea to separate your data analysis from papers,
+      talks, etc. (especially if you want to publish your data)
+    \item Have designated folders where stuff is written to (e.\,g.,
+      \texttt{results}, \texttt{figures}, \texttt{processed}, etc.)
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]{Folder organisation}
+  {One possible example!}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 4cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node[text width = 2cm] (top) at  (0, 0) {\faIcon{folder} \verb+project+};
+    % first level
+    \node (n1) at (4, 0)    {\faIcon{folder} \verb+admin+};
+    \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
+    \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
+    \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
+    \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    % connect the project folder to every first-level entry
+    \path (top.east) -- (n1.west);
+    \path (top.east) -- (n2.west);
+    \path (top.east) -- (n4.west);
+    \path (top.east) -- (n3.west);
+    \path (top.east) -- (file.west);
+  \end{tikzpicture}
+  \vfill
+\end{frame}
+
+\begin{frame}[fragile]{Folder organisation}
+  {Analysis folder}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 4cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node[text width = 2cm] (top) at  (0, 0) {\faIcon{folder} \verb+project+};
+    % first level
+    \node (n1) at (4, 0)    {\faIcon{folder} \verb+admin+};
+    \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
+    \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
+    \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
+    \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    % connect the project folder to every first-level entry
+    \path (top.east) -- (n1.west);
+    \path (top.east) -- (n2.west);
+    \path (top.east) -- (n4.west);
+    \path (top.east) -- (n3.west);
+    \path (top.east) -- (file.west);
+    % second level
+    \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
+    \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
+    \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
+    \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
+    \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    \path (n2.east) -- (o1.west);
+    \path (n2.east) -- (o2.west);
+    \path (n2.east) -- (o3.west);
+    \path (n2.east) -- (o4.west);
+    \path (n2.east) -- (o5.west);
+  \end{tikzpicture}
+  \vfill
+\end{frame}
+
+\begin{frame}[fragile]{Folder organisation}
+  {Analysis folder}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 4cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node[text width = 2cm] (top) at  (0, 0) {\faIcon{folder} \verb+project+};
+    % first level
+    \node (n1) at (4, 0)    {\faIcon{folder} \verb+admin+};
+    \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
+    \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
+    \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
+    \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    % connect the project folder to every first-level entry
+    \path (top.east) -- (n1.west);
+    \path (top.east) -- (n2.west);
+    \path (top.east) -- (n4.west);
+    \path (top.east) -- (n3.west);
+    \path (top.east) -- (file.west);
+    % second level
+    \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
+    \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
+    \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
+    \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
+    \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    \path (n2.east) -- (o1.west);
+    \path (n2.east) -- (o2.west);
+    \path (n2.east) -- (o3.west);
+    \path (n2.east) -- (o4.west);
+    \path (n2.east) -- (o5.west);
+    % third level
+    \node[text width = 5cm] (p1) at (12, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
+    \node[text width = 5cm] (p2) at (12, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+};
+    \node[text width = 5cm] (p3) at (12, -1.4) {\faIcon[regular]{file} \verb+03_modeling.R+};
+    \node[text width = 5cm] (p4) at (12, -2.1) {\faIcon[regular]{file} \verb+04_plots.R+};
+    \path (o1.center) -- (p1.west);
+    \path (o1.center) -- (p2.west);
+    \path (o1.center) -- (p3.west);
+    \path (o1.center) -- (p4.west);
+  \end{tikzpicture}
+  \vfill
+\end{frame}
+
+\begin{frame}[fragile]{Folder organisation}
+  {Analysis folder}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 4cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node[text width = 2cm] (top) at  (0, 0) {\faIcon{folder} \verb+project+};
+    % first level
+    \node (n1) at (4, 0)    {\faIcon{folder} \verb+admin+};
+    \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
+    \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
+    \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
+    \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    % connect the project folder to every first-level entry
+    \path (top.east) -- (n1.west);
+    \path (top.east) -- (n2.west);
+    \path (top.east) -- (n4.west);
+    \path (top.east) -- (n3.west);
+    \path (top.east) -- (file.west);
+    % second level
+    \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
+    \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
+    \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
+    \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
+    \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    \path (n2.east) -- (o1.west);
+    \path (n2.east) -- (o2.west);
+    \path (n2.east) -- (o3.west);
+    \path (n2.east) -- (o4.west);
+    \path (n2.east) -- (o5.west);
+    % third level
+    \node[text width = 5cm] (p1) at (12, 0) {\faIcon[regular]{file} \verb+subj1_ses01.txt+};
+    \node[text width = 5cm] (p2) at (12, -0.7) {\faIcon[regular]{file} \verb+subj1_ses02.txt+};
+    \node[text width = 5cm] (p3) at (12, -1.4) {\faIcon[regular]{file} \verb+subj2_ses01.txt+};
+    \node[text width = 5cm] (p4) at (12, -2.1) {\faIcon[regular]{file} \verb+subj2_ses02.txt+};
+    \node[text width = 5cm] (p5) at (12, -2.8) {\faIcon[regular]{file} \dots};
+    \path (o2.center) -- (p1.west);
+    \path (o2.center) -- (p2.west);
+    \path (o2.center) -- (p3.west);
+    \path (o2.center) -- (p4.west);
+    \path (o2.center) -- (p5.west);
+  \end{tikzpicture}
+  \vfill
+\end{frame}
+
+\begin{frame}[fragile]{Folder organisation}
+  {Analysis folder}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 4cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node[text width = 2cm] (top) at  (0, 0) {\faIcon{folder} \verb+project+};
+    % first level
+    \node (n1) at (4, 0)    {\faIcon{folder} \verb+admin+};
+    \node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
+    \node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
+    \node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
+    \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    % connect the project folder to every first-level entry
+    \path (top.east) -- (n1.west);
+    \path (top.east) -- (n2.west);
+    \path (top.east) -- (n4.west);
+    \path (top.east) -- (n3.west);
+    \path (top.east) -- (file.west);
+    % second level
+    \node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
+    \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
+    \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
+    \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
+    \node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    \path (n2.east) -- (o1.west);
+    \path (n2.east) -- (o2.west);
+    \path (n2.east) -- (o3.west);
+    \path (n2.east) -- (o4.west);
+    \path (n2.east) -- (o5.west);
+    % third level
+    \node[text width = 5cm] (p1) at (12, -0.7) {\faIcon[regular]{file}
+    \verb+data_all-subj.csv+};
+    \node[text width = 5cm] (p2) at (12, -1.4) {\faIcon[regular]{file}
+    \verb+data_all-subj.RData+};
+    \node[text width = 5cm] (p3) at (12, -2.1) {\faIcon[regular]{file}
+    \verb+eval_model1.csv+};
+    \node[text width = 5cm] (p4) at (12, -2.8) {\faIcon[regular]{file}
+    \verb+eval_model2.csv+};
+    \path (o4.center) -- (p1.west);
+    \path (o4.center) -- (p2.west);
+    \path (o4.center) -- (p3.west);
+    \path (o4.center) -- (p4.west);
+  \end{tikzpicture}
+  \vfill
+  \pause
+  The analysis folder you might want to share on OSF, GitHub, etc.
+\end{frame}
+
+\begin{frame}[fragile]{Folder organisation}
+  {Dissemination folder}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 4cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node[text width = 2cm] (top) at  (0, 0) {\faIcon{folder} \verb+project+};
+    % first level
+    \node (n1) at (4, 0)    {\faIcon{folder} \verb+admin+};
+    \node (n2) at (4, -0.7) {\faIcon{folder} \verb+analysis+};
+    \node[text width = 3.2cm] (n3) at (3.6, -1.4) {\faIcon{folder} \verb+dissemination+};
+    \node (n4) at (4, -2.1) {\faIcon{folder} \verb+material+};
+    \node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    % connect the project folder to every first-level entry
+    \path (top.east) -- (n1.west);
+    \path (top.east) -- (n2.west);
+    \path (top.east) -- (n3.west);
+    \path (top.east) -- (n4.west);
+    \path (top.east) -- (file.west);
+    % second level
+    \node (o1) at (8.5, 0) {\faIcon{folder} \verb+paper+};
+    \node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+talks+};
+    \node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
+    \node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
+    \node (o5) at (8.5, -2.8) {\faIcon{folder} \verb+tables+};
+    \path (n3.east) -- (o1.west);
+    \path (n3.east) -- (o2.west);
+    \path (n3.east) -- (o3.west);
+    \path (n3.east) -- (o4.west);
+    \path (n3.east) -- (o5.west);
+  \end{tikzpicture}
+  \vfill
+  \pause
+  Having separate folders for figures and tables helps you keep track of them
+  for your paper and talks
+\end{frame}
+
+\begin{frame}[fragile]{Figures and tables}
+  \begin{itemize}
+    \item Most of us (including me!) are not at a stage where we are
+      writing our papers or talks as reproducible documents
+      \pause
+    \item It is still a good idea to create tables and figures in R and keep the
+      code easily accessible
+      \pause
+    \item One suggestion
+
+      \begin{tikzpicture}[
+    every node/.style = {text width = 4.2cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    % figures
+    \node (fig) at  (0, 0) {\faIcon{folder} \verb+figures+};
+    \node (n1) at  (4, 0) {\faIcon[regular]{file} \verb+h1_barplot.R+};
+    \node (n2) at  (4, -0.7) {\faIcon[regular]{file} \verb+h1_barplot.png+};
+    \path (fig.center) -- (n1.west);
+    \path (fig.center) -- (n2.west);
+    % tables
+    \node (tab) at  (0, -1.5) {\faIcon{folder} \verb+tables+};
+    \node (o1) at  (4, -1.5) {\faIcon[regular]{file} \verb+h1_mean-table.Rmd+};
+    \node (o2) at  (4, -2.2) {\faIcon[regular]{file} \verb+h1_mean-table.docx+};
+    \path (tab.center) -- (o1.west);
+    \path (tab.center) -- (o2.west);
+      \end{tikzpicture}
+      \pause
+    \item I export the data for figures and tables from \texttt{analysis/code}
+      to \texttt{dissemination/results} so the dissemination folder is
+      self-contained
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]{Several data sources}
+  \begin{itemize}
+    \item When you have several different data sources like questionnaires and
+      eye-tracking data keep them in separate folders
+      \begin{tikzpicture}[
+    every node/.style = {text width = 4cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node (data) at  (0, 0) {\faIcon{folder} \verb+data+};
+    \node (n1) at  (4, 0) {\faIcon{folder} \verb+eyetracking+};
+    \node (n2) at  (4, -0.7) {\faIcon{folder} \verb+qualtrics+};
+    \path (data.center) -- (n1.west);
+    \path (data.center) -- (n2.west);
+      \end{tikzpicture}
+      \pause
+    \item Process them separately, e.\,g., with
+      \verb+01a_preprocessing_eyetracking.R+ and
+      \verb+01b_preprocessing_surveys.R+ and then \verb+02_combine-data.R+
+    \begin{tikzpicture}[
+    every node/.style = {text width = 5cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node (results) at  (0, 0) {\faIcon{folder} \verb+results+};
+    \node (n1) at  (4, 0) {\faIcon[regular]{file} \verb+data_eyetracking.csv+};
+    \node (n2) at  (4, -0.7) {\faIcon[regular]{file} \verb+data_surveys.csv+};
+    \node (n3) at  (4, -1.4) {\faIcon[regular]{file} \verb+data_complete.csv+};
+    \path (results.center) -- (n1.west);
+    \path (results.center) -- (n2.west);
+    \path (results.center) -- (n3.west);
+      \end{tikzpicture}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Toy example with 11 questions}
+  Thank you everybody for filling out our little toy survey in Qualtrics!
+  \vfill
+  \tiny
+\begin{tabular}{lllll}
+  \hline
+            ResponseId &      age &            sex & data\_sharing\_1 & data\_sharing\_2 \\ 
+  \hline
+R\_225ffqhb7qRaIGO:1   & Min.   :24.00   & m            : 2   & No :7   & Min.   :1.000   \\ 
+  R\_2F9fXxf3NedHqZl:1   & 1st Qu.:26.50   & f            :11   & Yes:7   & 1st Qu.:1.000   \\ 
+  R\_2foYj4iSgaBTkEO:1   & Median :28.00   & d            : 1   &  & Median :2.000   \\ 
+  R\_2J9B4aLaasQ1m81:1   & Mean   :29.86   & not indicated: 0   &  & Mean   :2.214   \\ 
+  R\_2P1TMDNlwm0gSIk:1   & 3rd Qu.:30.00   &  &  & 3rd Qu.:2.000   \\ 
+  R\_2pXfOSq8DBImG6R:1   & Max.   :43.00   &  &  & Max.   :6.000   \\ 
+  (Other)          :8   &  &  &  &  \\ 
+   \hline
+\end{tabular}
+
+  \vspace{.5cm}
+\begin{tabular}{lllllll}
+  \hline
+ rdm\_stmnt\_1 &  rdm\_stmnt\_2 &  rdm\_stmnt\_3 &  rdm\_stmnt\_4 &  rdm\_stmnt\_5 &           career\_level\_1 & career\_level\_2 \\ 
+  \hline
+Min.   :2.000   & Min.   :2   & Min.   :2.000   & Min.   :1.000   & Min.   :1.000   & Student          : 0   & Min.   : 1.000   \\ 
+  1st Qu.:3.250   & 1st Qu.:4   & 1st Qu.:2.250   & 1st Qu.:1.000   & 1st Qu.:1.000   & PhD student      :11   & 1st Qu.: 1.625   \\ 
+  Median :4.500   & Median :4   & Median :3.000   & Median :1.000   & Median :1.000   & Postdoc          : 1   & Median : 2.500   \\ 
+  Mean   :4.071   & Mean   :4   & Mean   :2.857   & Mean   :1.143   & Mean   :1.143   & Senior researcher: 0   & Mean   : 5.964   \\ 
+  3rd Qu.:5.000   & 3rd Qu.:5   & 3rd Qu.:3.000   & 3rd Qu.:1.000   & 3rd Qu.:1.000   & Professor        : 1   & 3rd Qu.: 4.500   \\ 
+  Max.   :5.000   & Max.   :5   & Max.   :5.000   & Max.   :2.000   & Max.   :2.000   & Other            : 1   & Max.   :38.000   \\ 
+   & NA's   :1   &  &  &  &  &  \\ 
+   \hline
+\end{tabular}
+\end{frame}
+
+% print(xtable::xtable(summary(dat[, 1:5])), include.rownames = FALSE)
+% print(xtable::xtable(summary(dat[, 6:12])), include.rownames = FALSE)
+
+\begin{frame}[fragile]{Folder structure for toy example}
+  {One possible structure!}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 4.3cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node (ex) at  (0, 0) {\faIcon{folder} \verb+example+};
+    \node (n1) at  (3, 0) {\faIcon{folder} \verb+code+};
+    \node (n2) at  (3, -0.7) {\faIcon{folder} \verb+data+};
+    \node (n3) at  (3, -1.4) {\faIcon[regular]{file} \verb+README.md+};
+    \path (ex.center) -- (n1.west);
+    \path (ex.center) -- (n2.west);
+    \path (ex.center) -- (n3.west);
+
+    \node (o1) at (7, 0.7) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
+    \node (o2) at (7, -0.7) {\faIcon{folder} \verb+codebook+};
+    \node (o3) at (7, -1.4) {\faIcon{folder} \verb+rawdata+};
+    \node (o4) at (7, -2.1) {\faIcon{folder} \verb+results+};
+    \path (n1.center) -- (o1.west);
+    \path (n2.center) -- (o2.west);
+    \path (n2.center) -- (o3.west);
+    \path (n2.center) -- (o4.west);
+
+    \node (p1) at (11, -0.7) {\faIcon[regular]{file} \verb+codebook_01.R+};
+    \node (p2) at (11, -1.4) {\faIcon[regular]{file} \verb+codebook_01.xlsx+};
+    \node (p3) at (11, -2.1) {\dots};
+
+    \path (o2.center) -- (p1.west);
+    \path (o2.center) -- (p2.west);
+    \path (o2.center) -- (p3.west);
+  \end{tikzpicture}
+
+\end{frame}
+
+\section{Metadata}
+
+\begin{frame}{Metadata answers questions}
+  % Six guiding questions; the question word of each item is set in bold
+  \begin{itemize}
+    \item \textbf{Who} created the data?
+    \item \textbf{Why} was the data created?
+    \item \textbf{When} was the data created?
+    \item \textbf{Where} is the data?
+    \item \textbf{How} was the data created?
+    \item \textbf{What} is the content of the data?
+  \end{itemize}
+  \vfill
+  \hfill{\tiny \citet{Wilbrandt2023}}
+\end{frame}
+
+\begin{frame}{Metadata}
+  \begin{block}{Metadata}
+  \dots is data about data.\\
+  \dots can be \emph{descriptive}, \emph{structural}, or \emph{administrative}.
+  \end{block}
+  \vfill
+  \begin{columns}
+    \begin{column}[t]{.5\textwidth}
+      Contains information on origin and background of data like
+        \begin{itemize}
+          \item Who, when, why, how, \dots
+          \item Used resources
+          \item Used abbreviations, units, names
+          \item Licenses
+          \item \dots
+        \end{itemize}
+    \end{column}
+    \begin{column}[t]{.5\textwidth}
+      Data can be anything like
+        \begin{itemize}
+          \item Book content
+          \item Pictures or audio files
+          \item Website content or a blog post
+          \item Journal paper
+          \item Research data
+          \item \dots
+        \end{itemize}
+    \end{column}
+  \end{columns}
+  \vfill
+\end{frame}
+
+\begin{frame}{Metadata examples}
+{Photo}
+  \begin{center}
+    \includegraphics[scale = .31]{../figures/metadata_photo}
+  \end{center}
+\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
+\end{frame}
+
+\begin{frame}{Metadata examples}
+{Book}
+  \begin{center}
+    \includegraphics[scale = .36]{../figures/metadata_book}
+  \end{center}
+\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
+\end{frame}
+
+\begin{frame}{Metadata examples}
+{Webpage}
+  \begin{center}
+    \includegraphics[scale = .27]{../figures/metadata_webpage}
+  \end{center}
+\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
+\end{frame}
+
+\begin{frame}{Metadata examples}
+{WORD document}
+  \begin{center}
+    \includegraphics[scale = .23]{../figures/metadata_word_document}
+  \end{center}
+\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
+\end{frame}
+
+\begin{frame}{Metadata for research data}
+\begin{tikzpicture}
+\node[font=\Large] (n1) at (0,0) {\bf \color{iwmorange} Study};
+
+\node[font=\large] (i1) at (0,-1) {$\bullet$ Persons};
+\node[font=\large] (i2) at (.36,-1.5) {$\bullet$ Background};
+\node[font=\large] (i3) at (.03,-2) {$\bullet$ Funding};
+\node[font=\large] (i4) at (-.38,-2.5) {$\bullet$ \dots};
+\node[draw=iwmorange, thick, fit={(n1) (i1) (i2) (i3) (i4)}, inner sep=10pt] (box) {};
+
+\node[font=\Large] (n2) at (5,0) {\bf \color{iwmorange} Data set};
+
+\node[font=\large] (j1) at (4.3,-1) {$\bullet$ Files};
+\node[font=\large] (j2) at (4.57,-1.5) {$\bullet$ Sources};
+\node[font=\large] (j3) at (4.65,-2) {$\bullet$ Methods};
+\node[font=\large] (j4) at (4.18,-2.5) {$\bullet$ \dots};
+\node[draw=iwmorange, thick, fit={(n2) (j1) (j2) (j3) (j4)}, inner sep=10pt] (box) {};
+
+\node[font=\Large] (n3) at (10,0) {\bf \color{iwmorange} Variables};
+
+\node[font=\large] (k1) at (9.7,-1) {$\bullet$ Data type};
+\node[font=\large] (k2) at (9.69,-1.5) {$\bullet$ Scale unit};
+\node[font=\large] (k3) at (9.85,-2) {$\bullet$ Value range};
+\node[font=\large] (k4) at (9.12,-2.5) {$\bullet$ \dots};
+\node[draw=iwmorange, thick, fit={(n3) (k1) (k2) (k3) (k4)}, inner sep=10pt] (box) {};
+
+\draw[-latex, thick] (n1) -- (n2);
+\draw[-latex, thick] (n2) -- (n3);
+\end{tikzpicture}
+  \vfill
+  \hfill\tiny \url{https://datamanagement.hms.harvard.edu/collect/readme-files}
+\end{frame}
+
+\section{README files}
+
+\begin{frame}{README files}
+  \begin{itemize}
+    \item Can be used to give information about all levels in a research
+      project: study/project, data set, variables; either in one README or in
+      several ones
+    \item Should provide a clear and concise description of all relevant details
+      about data collection, processing, and analysis
+    \item README files are created for different purposes:
+      \begin{itemize}
+        \item to document changes to files or file names within a folder
+        \item to explain file naming conventions, practices, etc.\ ``in
+          general'' for future reference
+        \item to specifically accompany files/data being deposited in a
+          repository
+      \end{itemize}
+    \item Creating a README file at the beginning of your research process,
+      and updating it consistently throughout your research, will help you
+      to compile a final README file when your data is ready for deposit
+    \item Find a template here:
+      \url{https://cornell.app.box.com/v/ReadmeTemplate}
+  \end{itemize}
+  \vfill
+  \hfill\tiny \url{https://datamanagement.hms.harvard.edu/collect/readme-files}
+\end{frame}
+
+\begin{frame}{Study/project}{README on top level}
+  \begin{itemize}
+    \item Project name and purpose
+    \item Funding information (process number!)
+    \item Ethics approved? LEK number!
+    \item Person(s) responsible for conducting the study
+    \item One or several studies? Information about them
+    \item Time/Duration of project 
+    \item \dots
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Data set}{README accompanying data set(s)}
+  \begin{itemize}
+    \item One or more data sets?
+    \item Time of data collection
+    \item Person(s) responsible for data collection
+    \item File organisation
+    \item Naming conventions
+    \item Preprocessing methods
+    \item Anything that is special about the data set(s)
+    \item Number of subjects
+    \item Variables
+    \item \dots
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Variables}{README accompanying a specific data set}
+  \begin{itemize}
+    \item You can use a README (or text file called \texttt{codebook.txt} or
+      similar) to document your variables
+    \item Especially, if you only have a few variables, this is an easy and fast
+      way to document them
+    \item If you are working with extensive surveys or questionnaires, it might
+      be a good time investment to create a more elaborate codebook
+  \end{itemize}
+  \vfill
+\end{frame}
+
+\section{Codebooks}
+
+\begin{frame}{What information about variables should a codebook include?}
+    % slido
+  \centering
+  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 3}
+
+  \url{https://app.sli.do/event/3S1Bn3Tjknuk5J5WiqAYzG}
+\end{frame}
+
+\begin{frame}{A codebook should include}
+  \begin{tabular}{lp{11cm}}
+    \hline
+    Variable name & Usually some abbreviation like \texttt{pna01} \\
+    Variable label & Brief description to identify variable \\
+    Question text & If applicable, exact wording from survey question \\
+    Values & Values the variable can take (e.\,g., 1 to 5) \\
+    Value labels & If applicable, textual descriptions of the values \\
+    Statistics & For example, range, mean, standard deviation for
+    numeric variables; frequencies and percentages for categorical variables \\
+    Missing data & If applicable, values and labels of missing data \\
+    Notes & Additional notes, remarks, or comments; for measures or
+    questions from copyrighted instruments, the notes field can be used to
+    cite the source \\
+    \hline
+  \end{tabular}
+  \vfill
+
+  \hfill\tiny \url{https://www.icpsr.umich.edu/web/ICPSR/cms/1983}
+\end{frame}
+
+\begin{frame}{Codebooks}
+  \begin{itemize}
+    \item There are many different ways to create a codebook
+    \item It can be a README, some other plain text file, a table (stored as CSV
+      or XLSX), a WORD document, or PDF
+    \item For a short questionnaire, it can be sufficient to export it as a PDF
+    \item Let's walk through a couple of options\dots
+  \end{itemize}
+  \vfill
+\end{frame}
+
+\begin{frame}{Option 1 -- Toy example with 11 questions}
+  {Simple PDF}
+  \begin{columns}
+    \begin{column}{.5\textwidth}
+      \begin{center}
+        \vspace{-.4cm}
+        Export from Qualtrics\\
+        \includegraphics[scale = .3]{../figures/codebook_1.png}
+      \end{center}
+    \end{column}
+    \begin{column}{.6\textwidth}
+      \begin{itemize}
+        \item For a simple questionnaire like this, the exported WORD document
+          from Qualtrics exported to PDF might be sufficient as a codebook
+        \item For longer questionnaires, the WORD document can still be a good
+          starting point to create a more elaborate codebook
+      \end{itemize}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}[fragile]{Option 2 -- Toy example with 11 questions}
+  {Plain text file}
+      \begin{center}
+        \vspace{-.3cm}
+        \footnotesize
+\begin{lstlisting}[language = bash, identifierstyle=\color{iwmgray}]
+  sex. Please indicate your sex.
+  -------------------------------------------------------------------------------
+  -1. m
+  -2. f
+  -3. d
+  -4. not indicated
+  
+  age. How old are you? Please enter your age in years.
+  -------------------------------------------------------------------------------
+  numerical input
+  
+  data_sharing_1. Have you ever published data in a repository?
+  -------------------------------------------------------------------------------
+  -1. No
+  -2. Yes
+\end{lstlisting}
+      \end{center}
+\end{frame}
+
+\begin{frame}[fragile]{Option 3 -- Toy example with 11 questions}
+  {Creating a simple codebook in R ``by hand''}
+\footnotesize
+  \begin{lstlisting}
+load("results/data_rdm-ms-ss2024_cleaned.RData")
+codebook <- data.frame(var_name = names(dat),
+                       var_text = c("Response Id", "Please indicate your sex.",
+                       "How old are you? Please enter your age in years.",
+                       ...
+                       "Sharing data is bad scientific practice",
+                       "What is your current career level?",
+                       "How long have you been working in science (in years)?"))
+
+codebook$type <- sapply(dat, class)
+codebook$n    <- sapply(dat, length)
+codebook$mean <- sapply(dat,
+                   function(x) ifelse(is.numeric(x), mean(x, na.rm = TRUE), NA))
+codebook$sd   <- sapply(dat,
+                   function(x) ifelse(is.numeric(x), sd(x, na.rm = TRUE), NA))
+
+openxlsx::write.xlsx(codebook, file = "codebook/codebook_01.xlsx")
+  \end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Option 3 -- Toy example with 11 questions}
+  {Creating a simple codebook in R ``by hand''}
+  \begin{center}
+    \includegraphics[scale = .6]{../figures/codebook_2.png}
+  \end{center}
+\end{frame}
+
+\begin{frame}[fragile]{Option 4 -- Toy example with 11 questions}
+  {Using the codebook package in R}
+  \begin{itemize}
+    \item When you export a Qualtrics questionnaire as an SPSS file and import it
+      into R using the haven package, you can use RMarkdown to create an
+      elaborate HTML codebook
+    \item It works best for classical questionnaire items
+    \item In our example, the survey is not formatted well enough for the
+      generated codebook to be completely correct
+  \end{itemize}
+  \footnotesize
+  \begin{lstlisting}
+#' ---
+#' title: Codebook for Data Set "RDM MS SS 2024"
+#' author: Nora Wickelmaier
+#' ---
+
+#+ echo = FALSE
+dat <- haven::read_spss("../rawdata/RDM_MS_SS2024_download_2024-06-04.sav")
+codebook::codebook(dat)
+  \end{lstlisting}
+\end{frame}
+
+\appendix
+%%\begin{frame}[allowframebreaks]{References}
+\begin{frame}{References}
+%\renewcommand{\bibfont}{\small}
+  \printbibliography
+\vfill
+\end{frame}
+
+\end{document}
+
--- a/03_data_organisation/example/README.md
+++ b/03_data_organisation/example/README.md
@ -0,0 +1,47 @@
+# Toy data set for the methods seminar on data management SS2024
+
+## Responsible person
+
+Nora Wickelmaier
+Referentin Forschungsmethoden und Forschungsdatenmanagement
+Leibniz-Institut für Wissensmedien (IWM)
+n.wickelmaier@iwm-tuebingen.de
+
+## Folder structure and naming conventions
+
+```
+/example/
+|
+|- /code/
+|- /data/
+    |- /codebook/
+    |- /rawdata/
+    |- /results/
+```
+
+The `code` folder contains analysis scripts written in R. The scripts are
+numbered, indicating the order they should be executed in.
+
+The `data` folder contains all folders associated with data and its
+documentation.
+
+The `codebook` folder contains different codebook options and R scripts that
+create these codebooks. If the codebook is created by an R script, the script
+and the codebook are named identically, e.g., `codebook_01.R` and
+`codebook_01.xlsx`.
+
+The `rawdata` folder contains the downloads from Qualtrics. In Qualtrics, the
+variables have been selected and ordered and then downloaded, without the
+additional columns Qualtrics adds by default. The naming convention for the
+downloaded files is
+```
+RDM_MS_SS2024_download_<YYYY-MM-DD>.<fileending>
+```
+No other files than the downloads from Qualtrics should go into this folder!
+
+The `results` folder contains processed data. The scripts in `/code/` process
+the data from `/rawdata/` and save the files containing the processed data to
+`/results/`. Data can be exported as CSV files or RData files. If different file
+formats contain the same data, they should be named identically, e.g.,
+`data_rdm-ms-ss2024_cleaned.csv` and `data_rdm-ms-ss2024_cleaned.RData`.
+
--- a/03_data_organisation/example/code/01_preprocessing.R
+++ b/03_data_organisation/example/code/01_preprocessing.R
@ -0,0 +1,78 @@
+# 01_preprocessing.R
+#
+# Cleaning up data for toy data set Methods Seminar SS2024
+#
+# Input:  data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv
+# Output: data/results/data_rdm-ms-ss2024_cleaned.csv
+#         data/results/data_rdm-ms-ss2024_cleaned.RData
+#
+# created: 2024-06-03
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/teaching/iwm/data_management/03_data_organisation/example/")
+
+# Qualtrics download; all paths are relative to the example folder
+rawfile <- "data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv"
+
+# The download starts with three header rows (variable names, question texts,
+# import ids). They are skipped when reading the data; the variable names are
+# taken from the first row afterwards.
+dat <- read.table(rawfile, sep = ",", skip = 3, stringsAsFactors = TRUE,
+                  na.strings = "")
+
+names(dat) <- readLines(rawfile, 1) |>
+  strsplit(split = ",") |>
+  unlist()
+
+# Clean up variables
+dat$ResponseId <- factor(dat$ResponseId)
+
+dat$sex <- factor(dat$sex, levels = c("m", "f", "d", "not indicated"))
+
+dat$data_sharing_1 <- factor(dat$data_sharing_1, levels = c("No", "Yes"))
+
+dat$career_level_1 <- factor(dat$career_level_1,
+                             levels = c("Student", "PhD student", "Postdoc",
+                                        "Senior researcher", "Professor",
+                                        "Other"))
+
+## Fix data_sharing_2
+# The free-text answer "1 out of 4" makes read.table() import the whole column
+# as a factor. Convert to character first so that the entered numbers are
+# kept; as.numeric() applied directly to a factor returns the internal level
+# codes instead of the values.
+dat$data_sharing_2 <- as.character(dat$data_sharing_2)
+dat$data_sharing_2[dat$data_sharing_2 == "1 out of 4"] <- "1"
+dat$data_sharing_2 <- as.numeric(dat$data_sharing_2)
+
+# Create numeric statement variables
+# The order of the levels defines the coding:
+# 1 = "Strongly disagree", ..., 5 = "Strongly agree"
+agree_levels <- c("Strongly disagree", "Disagree",
+                  "Neither agree nor disagree", "Agree", "Strongly agree")
+
+# Looping over the item names treats every statement identically and avoids
+# copy-and-paste mix-ups between the items
+for (item in paste0("rdm_stmnt_", 1:5)) {
+  dat[[item]] <- as.numeric(factor(dat[[item]], levels = agree_levels))
+}
+
+# Save cleaned data set
+write.table(dat, file = "data/results/data_rdm-ms-ss2024_cleaned.csv", sep = ";",
+            row.names = FALSE, quote = FALSE)
+
+save(dat, file = "data/results/data_rdm-ms-ss2024_cleaned.RData")
+
--- a/03_data_organisation/example/data/codebook/RDM_MS_SS2024.docx
+++ b/03_data_organisation/example/data/codebook/RDM_MS_SS2024.docx
--- a/03_data_organisation/example/data/codebook/codebook_01.R
+++ b/03_data_organisation/example/data/codebook/codebook_01.R
@ -0,0 +1,41 @@
+# codebook_01.R
+#
+# Codebook generation example: question texts are typed in by hand
+#
+# Input:  data/results/data_rdm-ms-ss2024_cleaned.RData
+# Output: data/codebook/codebook_01.csv
+#         data/codebook/codebook_01.xlsx
+#
+# created: 2024-06-04
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/teaching/iwm/data_management/03_data_organisation/example/")
+
+load("data/results/data_rdm-ms-ss2024_cleaned.RData")
+
+# Question texts; they must be listed in the same order as the columns of dat
+# (ResponseId, age, sex, data_sharing_1, data_sharing_2, rdm_stmnt_1, ...)
+codebook <- data.frame(var_name = names(dat),
+                       var_text = c("Response Id",
+                       "How old are you? Please enter your age in years.",
+                       "Please indicate your sex.",
+                       "Have you ever published data in a repository?",
+                       "How many of your data sets have you published so far?",
+                       "All my analyses are preregistered",
+                       "Sharing my data is very important to me",
+                       "I invest more time in research data management than my colleagues",
+                       "I think research data management is overrated",
+                       "Sharing data is bad scientific practice",
+                       "What is your current career level?",
+                       "How long have you been working in science (in years)?"))
+
+# Apply a summary function to numeric variables only and ignore missing
+# values; all other variable types get NA
+num_stat <- function(x, stat) if (is.numeric(x)) stat(x, na.rm = TRUE) else NA
+
+codebook$type <- sapply(dat, class)
+codebook$n    <- sapply(dat, length)  # number of rows, missing values included
+codebook$mean <- sapply(dat, num_stat, stat = mean)
+codebook$sd   <- sapply(dat, num_stat, stat = sd)
+
+# row.names = FALSE: otherwise the row numbers are written as an extra first
+# column that has no entry in the header line
+write.table(codebook,
+            file = "data/codebook/codebook_01.csv",
+            na = "",
+            sep = ";",
+            row.names = FALSE,
+            quote = FALSE)
+
+openxlsx::write.xlsx(codebook, file = "data/codebook/codebook_01.xlsx")
+
--- a/03_data_organisation/example/data/codebook/codebook_01.csv
+++ b/03_data_organisation/example/data/codebook/codebook_01.csv
@ -0,0 +1,13 @@
+var_name;var_text;type;n;mean;sd
+1;ResponseId;Response Id;factor;13;;
+2;age;Please indicate your sex.;integer;13;29.6923076923077;5.99144689515278
+3;sex;How old are you? Please enter your age in years.;factor;13;;
+4;data_sharing_1;Have you ever published data in a repository?;factor;13;;
+5;data_sharing_2;How many of your data sets have you published so far?;numeric;13;2.30769230769231;1.65250392761083
+6;rdm_stmnt_1;All my analyses are preregistered;numeric;13;4.15384615384615;1.14354374979373
+7;rdm_stmnt_2;Sharing my data is very important to me;numeric;13;4;
+8;rdm_stmnt_3;I invest more time in research data management than my colleagues;numeric;13;2.84615384615385;0.800640769025436
+9;rdm_stmnt_4;I think research data management is overrated;numeric;13;1.15384615384615;0.375533808099405
+10;rdm_stmnt_5;Sharing data is bad scientific practice;numeric;13;1.15384615384615;0.375533808099405
+11;career_level_1;What is your current career level?;factor;13;;
+12;career_level_2;How long have you been working in science (in years)?;numeric;13;6.26923076923077;10.1788493632126
--- a/03_data_organisation/example/data/codebook/codebook_01.xlsx
+++ b/03_data_organisation/example/data/codebook/codebook_01.xlsx
--- a/03_data_organisation/example/data/codebook/codebook_02.R
+++ b/03_data_organisation/example/data/codebook/codebook_02.R
@ -0,0 +1,68 @@
+# codebook_02.R
+#
+# Codebook generation example: question texts are taken from the variable
+# labels stored in the SPSS export
+#
+# Input:  data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav
+# Output: data/codebook/codebook_02.csv
+#         data/codebook/codebook_02.xlsx
+#
+# created: 2024-06-04
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/teaching/iwm/data_management/03_data_organisation/example/")
+
+dat <- as.data.frame(haven::read_spss("data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav"))
+
+## Fix data_sharing_2
+# One answer was entered as the text "1 out of 4"; recode it to 1 so that the
+# variable can be converted to numeric further down
+dat$data_sharing_2[dat$data_sharing_2 == "1 out of 4"] <- 1
+
+# Look at attributes
+attributes(dat$sex)
+
+# Create codebook with survey questions
+# The question text is read from the "label" attribute of each column
+codebook <- data.frame(variable = names(dat),
+                       label = sapply(dat, function(x) attr(x, "label")))
+
+# Clean up data frame
+# NOTE(review): presumably unlabel() only strips the labelled class and leaves
+# the "labels" attribute used below in place -- confirm
+dat <- as.data.frame(lapply(dat, sjlabelled::unlabel))
+
+sapply(dat, class)  # Look at classes
+
+dat$age <- as.numeric(dat$age)
+dat$career_level_2 <- as.numeric(dat$career_level_2)
+dat$data_sharing_2 <- as.numeric(dat$data_sharing_2)
+
+# Categorical variables: the names of the "labels" attribute become the
+# factor labels
+dat$sex <- factor(dat$sex,
+                  levels = 1:4,
+                  labels = names(attr(dat$sex, "labels")))
+dat$data_sharing_1 <- factor(dat$data_sharing_1,
+                             levels = 1:2,
+                             labels = names(attr(dat$data_sharing_1, "labels")))
+dat$career_level_1 <- factor(dat$career_level_1,
+                             levels = 1:6,
+                             labels = names(attr(dat$career_level_1, "labels")))
+
+# Add descriptive statistics to codebook
+
+# Apply a summary function to numeric variables only and ignore missing
+# values; all other variable types get NA
+num_stat <- function(x, stat) if (is.numeric(x)) stat(x, na.rm = TRUE) else NA
+
+codebook$n <- sapply(dat, length)  # number of rows, missing values included
+codebook$type <- sapply(dat, class)
+codebook$mean <- sapply(dat, num_stat, stat = mean)
+codebook$sd <- sapply(dat, num_stat, stat = sd)
+
+# props <- function(x) {
+#   if (is.factor(x)) {
+#     proportions(summary(x))
+#   } else {
+#     NA
+#   }
+# }
+# 
+# codebook$prop <- lapply(dat, props)
+
+# row.names = FALSE: the row names only repeat the variable names and would
+# add a first column without an entry in the header line
+write.table(codebook,
+            file = "data/codebook/codebook_02.csv",
+            na = "",
+            sep = ";",
+            row.names = FALSE,
+            quote = FALSE)
+
+openxlsx::write.xlsx(codebook, file = "data/codebook/codebook_02.xlsx")
+
+
--- a/03_data_organisation/example/data/codebook/codebook_02.csv
+++ b/03_data_organisation/example/data/codebook/codebook_02.csv
@ -0,0 +1,13 @@
+variable;label;n;type;mean;sd
+ResponseId;ResponseId;Response ID;13;character;;
+age;age;How old are you? Please enter your age in years.;13;numeric;29.6923076923077;5.99144689515278
+sex;sex;Please indicate your sex.;13;factor;;
+data_sharing_1;data_sharing_1;Have you ever published data in a repository?;13;factor;;
+data_sharing_2;data_sharing_2;How many of your data sets have you published so far?;13;numeric;1.38461538461538;1.85015591858549
+rdm_stmnt_1;rdm_stmnt_1;Please indicate how much you agree with the following statements - All my analyses are preregistered;13;numeric;4.15384615384615;1.14354374979373
+rdm_stmnt_2;rdm_stmnt_2;Please indicate how much you agree with the following statements - Sharing my data is very important to me;13;numeric;4;
+rdm_stmnt_3;rdm_stmnt_3;Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues;13;numeric;2.84615384615385;0.800640769025436
+rdm_stmnt_4;rdm_stmnt_4;Please indicate how much you agree with the following statements - I think research data management is overrated;13;numeric;1.84615384615385;0.987096233585649
+rdm_stmnt_5;rdm_stmnt_5;Please indicate how much you agree with the following statements - Sharing data is bad scientific practice;13;numeric;1.15384615384615;0.375533808099405
+career_level_1;career_level_1;What is your current career level?;13;factor;;
+career_level_2;career_level_2;How long have you been working in science (in years)?;13;numeric;6.26923076923077;10.1788493632126
--- a/03_data_organisation/example/data/codebook/codebook_02.xlsx
+++ b/03_data_organisation/example/data/codebook/codebook_02.xlsx
--- a/03_data_organisation/example/data/codebook/codebook_03.R
+++ b/03_data_organisation/example/data/codebook/codebook_03.R
@ -0,0 +1,10 @@
+#' ---
+#' title: Codebook for Data Set "RDM MS SS 2024"
+#' author: Nora Wickelmaier
+#' ---
+
+#+ echo = FALSE
+# Import the SPSS export of the survey. The path is written relative to
+# data/codebook/, the folder this script is stored in.
+dat <- haven::read_spss("../rawdata/RDM_MS_SS2024_download_2024-06-04.sav")
+
+# Build the codebook from the imported data
+codebook::codebook(dat)
+
--- a/03_data_organisation/example/data/codebook/codebook_manual.txt
+++ b/03_data_organisation/example/data/codebook/codebook_manual.txt
@ -0,0 +1,94 @@
+###############################################################################
+This file contains an overview of the variables from a toy data set collected
+at the methods seminar SS 2024. The raw data contained in
+"RDM_MS_SS2024_download_2024-06-03_v1.csv" contain additional variables
+created by Qualtrics. The variables have been preprocessed and are stored in
+"data_rdm-ms-ss2024_cleaned.csv".
+###############################################################################
+
+
+ResponseId. <Qualtrics ID of subject>
+-------------------------------------------------------------------------------
+random sequence of numbers, letters, and underscore
+
+
+sex. Please indicate your sex.
+-------------------------------------------------------------------------------
+-1. m
+-2. f
+-3. d
+-4. not indicated
+
+
+age. How old are you? Please enter your age in years.
+-------------------------------------------------------------------------------
+numerical input
+
+
+data_sharing_1. Have you ever published data in a repository?
+-------------------------------------------------------------------------------
+-1. No
+-2. Yes
+
+
+data_sharing_2. How many of your data sets have you published so far?
+-------------------------------------------------------------------------------
+numerical input
+
+
+rdm_stmnt. Please indicate how much you agree with the following statements:
+
+rdm_stmnt_1. All my analyses are preregistered
+-------------------------------------------------------------------------------
+-1. Strongly disagree
+-2. Disagree
+-3. Neither agree nor disagree
+-4. Agree
+-5. Strongly agree
+
+rdm_stmnt_2. Sharing my data is very important to me
+-------------------------------------------------------------------------------
+-1. Strongly disagree
+-2. Disagree
+-3. Neither agree nor disagree
+-4. Agree
+-5. Strongly agree
+
+rdm_stmnt_3. I invest more time in research data management than my colleagues
+-------------------------------------------------------------------------------
+-1. Strongly disagree
+-2. Disagree
+-3. Neither agree nor disagree
+-4. Agree
+-5. Strongly agree
+
+rdm_stmnt_4. I think research data management is overrated
+-------------------------------------------------------------------------------
+-1. Strongly disagree
+-2. Disagree
+-3. Neither agree nor disagree
+-4. Agree
+-5. Strongly agree
+
+rdm_stmnt_5. Sharing data is bad scientific practice
+-------------------------------------------------------------------------------
+-1. Strongly disagree
+-2. Disagree
+-3. Neither agree nor disagree
+-4. Agree
+-5. Strongly agree
+
+
+career_level_1. What is your current career level?
+-------------------------------------------------------------------------------
+-1. Student
+-2. PhD student
+-3. Postdoc
+-4. Senior researcher
+-5. Professor
+-6. Other
+
+
+career_level_2. How long have you been working in science (in years)?
+-------------------------------------------------------------------------------
+numerical input
--- a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.csv
+++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.csv
@ -0,0 +1,16 @@
+ResponseId,age,sex,data_sharing_1,data_sharing_2,rdm_stmnt_1,rdm_stmnt_2,rdm_stmnt_3,rdm_stmnt_4,rdm_stmnt_5,career_level_1,career_level_2
+Response ID,How old are you? Please enter your age in years.,Please indicate your sex.,Have you ever published data in a repository?,How many of your data sets have you published so far?,Please indicate how much you agree with the following statements - All my analyses are preregistered,Please indicate how much you agree with the following statements - Sharing my data is very important to me,Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues,Please indicate how much you agree with the following statements - I think research data management is overrated,Please indicate how much you agree with the following statements - Sharing data is bad scientific practice,What is your current career level?,How long have you been working in science (in years)?
+"{""ImportId"":""_recordId""}","{""ImportId"":""QID3_TEXT""}","{""ImportId"":""QID1""}","{""ImportId"":""QID4""}","{""ImportId"":""QID7_TEXT""}","{""ImportId"":""QID2_1""}","{""ImportId"":""QID2_2""}","{""ImportId"":""QID2_3""}","{""ImportId"":""QID2_4""}","{""ImportId"":""QID2_5""}","{""ImportId"":""QID8""}","{""ImportId"":""QID9_TEXT""}"
+R_8q7OpSkcuPT7SbI,42,f,No,1,Neither agree nor disagree,Agree,Strongly agree,Strongly disagree,Strongly disagree,Other,14
+R_8Io4pbk0A1a37VL,28,f,Yes,1,Strongly agree,,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,1
+R_2J9B4aLaasQ1m81,28,f,Yes,1 out of 4,Strongly agree,Strongly agree,Disagree,Disagree,Strongly disagree,PhD student,3
+R_80kqWr3W48SgiUZ,43,f,Yes,6,Agree,Agree,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,3
+R_8QpI8T0rjTjaPPr,30,f,Yes,4,Strongly agree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,5
+R_8QoVv6THz1Qjtuz,28,f,Yes,1,Disagree,Disagree,Disagree,Agree,Strongly disagree,Professor,38
+R_2F9fXxf3NedHqZl,25,d,No,0,Agree,Strongly agree,Disagree,Neither agree nor disagree,Disagree,PhD student,2
+R_2foYj4iSgaBTkEO,24,f,No,0,Strongly agree,Strongly agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1
+R_83T6Oak5vI6GNJ7,30,f,Yes,1,Strongly agree,Agree,Neither agree nor disagree,Neither agree nor disagree,Strongly disagree,Postdoc,7
+R_2Vz26rWsOLYwqnD,25,m,Yes,3,Agree,Agree,Neither agree nor disagree,Disagree,Disagree,PhD student,2
+R_8HcBgUUm1BXFfhv,29,m,No,0,Strongly agree,Disagree,Disagree,Strongly disagree,Strongly disagree,PhD student,3
+R_2P1TMDNlwm0gSIk,26,f,No,0,Disagree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1.5
+R_225ffqhb7qRaIGO,28,f,No,0,Strongly agree,Strongly agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1
--- a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav
+++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sav
--- a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sps
+++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-04.sps
@ -0,0 +1,66 @@
+* Encoding: UTF-8.
+TITLE "RDM_MS_SS2024".
+SUBTITLE "".
+VARIABLE LABELS
+ResponseId "Response ID"
+age "How old are you? Please enter your age in years."
+sex "Please indicate your sex."
+data_sharing_1 "Have you ever published data in a repository?"
+data_sharing_2 "How many of your data sets have you published so far?"
+rdm_stmnt_1 "Please indicate how much you agree with the following statements - All my analyses are preregistered"
+rdm_stmnt_2 "Please indicate how much you agree with the following statements - Sharing my data is very important to me"
+rdm_stmnt_3 "Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues"
+rdm_stmnt_4 "Please indicate how much you agree with the following statements - I think research data management is overrated"
+rdm_stmnt_5 "Please indicate how much you agree with the following statements - Sharing data is bad scientific practice"
+career_level_1 "What is your current career level?"
+career_level_2 "How long have you been working in science (in years)?"
+.
+VALUE LABELS
+	/sex
+		1 "m"
+		2 "f"
+		3 "d"
+		4 "not indicated"
+	/data_sharing_1
+		1 "No"
+		2 "Yes"
+	/rdm_stmnt_1
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_2
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_3
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_4
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_5
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/career_level_1
+		1 "Student"
+		2 "PhD student"
+		3 "Postdoc"
+		4 "Senior researcher"
+		5 "Professor"
+		6 "Other"
+.
+CACHE.
+EXECUTE.
--- a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv
+++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.csv
@ -0,0 +1,17 @@
+ResponseId,age,sex,data_sharing_1,data_sharing_2,rdm_stmnt_1,rdm_stmnt_2,rdm_stmnt_3,rdm_stmnt_4,rdm_stmnt_5,career_level_1,career_level_2
+Response ID,How old are you? Please enter your age in years.,Please indicate your sex.,Have you ever published data in a repository?,How many of your data sets have you published so far?,Please indicate how much you agree with the following statements - All my analyses are preregistered,Please indicate how much you agree with the following statements - Sharing my data is very important to me,Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues,Please indicate how much you agree with the following statements - I think research data management is overrated,Please indicate how much you agree with the following statements - Sharing data is bad scientific practice,What is your current career level?,How long have you been working in science (in years)?
+"{""ImportId"":""_recordId""}","{""ImportId"":""QID3_TEXT""}","{""ImportId"":""QID1""}","{""ImportId"":""QID4""}","{""ImportId"":""QID7_TEXT""}","{""ImportId"":""QID2_1""}","{""ImportId"":""QID2_2""}","{""ImportId"":""QID2_3""}","{""ImportId"":""QID2_4""}","{""ImportId"":""QID2_5""}","{""ImportId"":""QID8""}","{""ImportId"":""QID9_TEXT""}"
+R_8q7OpSkcuPT7SbI,42,f,No,1,Neither agree nor disagree,Agree,Strongly agree,Strongly disagree,Strongly disagree,Other,14
+R_8Io4pbk0A1a37VL,28,f,Yes,1,Strongly agree,,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,1
+R_2J9B4aLaasQ1m81,28,f,Yes,1 out of 4,Strongly agree,Strongly agree,Disagree,Disagree,Strongly disagree,PhD student,3
+R_80kqWr3W48SgiUZ,43,f,Yes,6,Agree,Agree,Neither agree nor disagree,Disagree,Strongly disagree,PhD student,3
+R_8QpI8T0rjTjaPPr,30,f,Yes,4,Strongly agree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,5
+R_8QoVv6THz1Qjtuz,28,f,Yes,1,Disagree,Disagree,Disagree,Agree,Strongly disagree,Professor,38
+R_2F9fXxf3NedHqZl,25,d,No,0,Agree,Strongly agree,Disagree,Neither agree nor disagree,Disagree,PhD student,2
+R_2foYj4iSgaBTkEO,24,f,No,0,Strongly agree,Strongly agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1
+R_83T6Oak5vI6GNJ7,30,f,Yes,1,Strongly agree,Agree,Neither agree nor disagree,Neither agree nor disagree,Strongly disagree,Postdoc,7
+R_2Vz26rWsOLYwqnD,25,m,Yes,3,Agree,Agree,Neither agree nor disagree,Disagree,Disagree,PhD student,2
+R_8HcBgUUm1BXFfhv,29,m,No,0,Strongly agree,Disagree,Disagree,Strongly disagree,Strongly disagree,PhD student,3
+R_2P1TMDNlwm0gSIk,26,f,No,0,Disagree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1.5
+R_225ffqhb7qRaIGO,28,f,No,0,Strongly agree,Strongly agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,1
+R_2pXfOSq8DBImG6R,32,f,No,0,Neither agree nor disagree,Agree,Neither agree nor disagree,Strongly disagree,Strongly disagree,PhD student,2
--- a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sav
+++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sav
--- a/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sps
+++ b/03_data_organisation/example/data/rawdata/RDM_MS_SS2024_download_2024-06-07.sps
@ -0,0 +1,66 @@
+* Encoding: UTF-8.
+TITLE "RDM_MS_SS2024".
+SUBTITLE "".
+VARIABLE LABELS
+ResponseId "Response ID"
+age "How old are you? Please enter your age in years."
+sex "Please indicate your sex."
+data_sharing_1 "Have you ever published data in a repository?"
+data_sharing_2 "How many of your data sets have you published so far?"
+rdm_stmnt_1 "Please indicate how much you agree with the following statements - All my analyses are preregistered"
+rdm_stmnt_2 "Please indicate how much you agree with the following statements - Sharing my data is very important to me"
+rdm_stmnt_3 "Please indicate how much you agree with the following statements - I invest more time in research data management than my colleagues"
+rdm_stmnt_4 "Please indicate how much you agree with the following statements - I think research data management is overrated"
+rdm_stmnt_5 "Please indicate how much you agree with the following statements - Sharing data is bad scientific practice"
+career_level_1 "What is your current career level?"
+career_level_2 "How long have you been working in science (in years)?"
+.
+VALUE LABELS
+	/sex
+		1 "m"
+		2 "f"
+		3 "d"
+		4 "not indicated"
+	/data_sharing_1
+		1 "No"
+		2 "Yes"
+	/rdm_stmnt_1
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_2
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_3
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_4
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/rdm_stmnt_5
+		1 "Strongly disagree"
+		2 "Disagree"
+		3 "Neither agree nor disagree"
+		4 "Agree"
+		5 "Strongly agree"
+	/career_level_1
+		1 "Student"
+		2 "PhD student"
+		3 "Postdoc"
+		4 "Senior researcher"
+		5 "Professor"
+		6 "Other"
+.
+CACHE.
+EXECUTE.
--- a/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.RData
+++ b/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.RData
--- a/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.csv
+++ b/03_data_organisation/example/data/results/data_rdm-ms-ss2024_cleaned.csv
@ -0,0 +1,15 @@
+ResponseId;age;sex;data_sharing_1;data_sharing_2;rdm_stmnt_1;rdm_stmnt_2;rdm_stmnt_3;rdm_stmnt_4;rdm_stmnt_5;career_level_1;career_level_2
+R_8q7OpSkcuPT7SbI;42;f;No;2;3;4;5;1;1;Other;14
+R_8Io4pbk0A1a37VL;28;f;Yes;2;5;NA;3;1;1;PhD student;1
+R_2J9B4aLaasQ1m81;28;f;Yes;2;5;5;2;1;1;PhD student;3
+R_80kqWr3W48SgiUZ;43;f;Yes;6;4;4;3;1;1;PhD student;3
+R_8QpI8T0rjTjaPPr;30;f;Yes;5;5;4;3;1;1;PhD student;5
+R_8QoVv6THz1Qjtuz;28;f;Yes;2;2;2;2;1;1;Professor;38
+R_2F9fXxf3NedHqZl;25;d;No;1;4;5;2;2;2;PhD student;2
+R_2foYj4iSgaBTkEO;24;f;No;1;5;5;3;1;1;PhD student;1
+R_83T6Oak5vI6GNJ7;30;f;Yes;2;5;4;3;1;1;Postdoc;7
+R_2Vz26rWsOLYwqnD;25;m;Yes;4;4;4;3;2;2;PhD student;2
+R_8HcBgUUm1BXFfhv;29;m;No;1;5;2;2;1;1;PhD student;3
+R_2P1TMDNlwm0gSIk;26;f;No;1;2;4;3;1;1;PhD student;1.5
+R_225ffqhb7qRaIGO;28;f;No;1;5;5;3;1;1;PhD student;1
+R_2pXfOSq8DBImG6R;32;f;No;1;3;4;3;1;1;PhD student;2
--- a/figures/QR
+++ b/figures/QR
--- a/figures/codebook_1.png
+++ b/figures/codebook_1.png
--- a/figures/codebook_2.png
+++ b/figures/codebook_2.png
--- a/figures/email_data_request_2024_03.png
+++ b/figures/email_data_request_2024_03.png
--- a/figures/email_data_request_2024_04.png
+++ b/figures/email_data_request_2024_04.png