845 lines
31 KiB
TeX
845 lines
31 KiB
TeX
\documentclass[aspectratio=169]{beamer}
|
|
|
|
\usepackage{listings}
|
|
\usepackage[utf8]{inputenc}
|
|
\usepackage[style = apa, backend = biber, natbib = true]{biblatex}
|
|
\addbibresource{../literature/lit.bib}
|
|
|
|
\usepackage{fancyvrb}
|
|
\usepackage{fontawesome5} % get icons
|
|
\usepackage{multirow}
|
|
\usepackage{color, colortbl}
|
|
|
|
\usepackage{tikz}
|
|
\usetikzlibrary{fit}
|
|
\usepackage[edges]{forest}
|
|
|
|
\lstset{language=R,%
|
|
backgroundcolor=\color{iwmgray!15!white},
|
|
basicstyle=\ttfamily\color{iwmgray},
|
|
frame=none,
|
|
commentstyle=\slshape\color{iwmgreen},
|
|
keywordstyle=\bfseries\color{iwmgray},
|
|
identifierstyle=\color{iwmpurple},
|
|
stringstyle=\color{iwmblue},
|
|
numbers=none,%left,numberstyle=\tiny,
|
|
basewidth={.5em, .4em},
|
|
showstringspaces=false,
|
|
emphstyle=\color{red!50!white}}
|
|
|
|
\makeatletter \def\newblock{\beamer@newblock} \makeatother
|
|
|
|
\beamertemplatenavigationsymbolsempty
|
|
\setbeamertemplate{itemize items}[circle]
|
|
\setbeamertemplate{section in toc}[circle]
|
|
\mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
|
|
\setbeamercolor{block body}{bg=iwmorange!50!white}
|
|
\setbeamercolor{block title}{fg=white, bg=iwmorange}
|
|
% Definitions for biblatex
|
|
\setbeamercolor{bibliography entry note}{fg=iwmgray}
|
|
\setbeamercolor{bibliography entry author}{fg=iwmgray}
|
|
\setbeamertemplate{bibliography item}{}
|
|
|
|
\definecolor{iwmorange}{RGB}{255,105,0}
|
|
\definecolor{iwmgray}{RGB}{67,79,79}
|
|
\definecolor{iwmblue}{RGB}{60,180,220}
|
|
\definecolor{iwmgreen}{RGB}{145,200,110}
|
|
\definecolor{iwmpurple}{RGB}{120,0,75}
|
|
|
|
\setbeamercolor{title}{fg=iwmorange}
|
|
\setbeamercolor{frametitle}{fg=iwmorange}
|
|
\setbeamercolor{structure}{fg=iwmorange}
|
|
\setbeamercolor{normal text}{fg=iwmgray}
|
|
\setbeamercolor{author}{fg=iwmgray}
|
|
\setbeamercolor{date}{fg=iwmgray}
|
|
|
|
\newcommand{\vect}[1]{\mathbf{#1}}
|
|
\newcommand{\mat}[1]{\mathbf{#1}}
|
|
\newcommand{\gvect}[1]{\boldsymbol{#1}}
|
|
\newcommand{\gmat}[1]{\boldsymbol{#1}}
|
|
|
|
\AtBeginSection[]{
|
|
\frame{
|
|
\tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}}
|
|
|
|
\setbeamertemplate{headline}{
|
|
\begin{beamercolorbox}{section in head}
|
|
\vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
|
|
\end{beamercolorbox}
|
|
}
|
|
|
|
\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
|
|
|
|
\title{Data organisation for effective research data management}
|
|
\author{Nora Wickelmaier}
|
|
\date{June 10, 2024}
|
|
|
|
\begin{document}
|
|
|
|
\begin{frame}{}
|
|
\thispagestyle{empty}
|
|
\titlepage
|
|
\end{frame}
|
|
|
|
\begin{frame}{Data request}
|
|
\begin{center}
|
|
\includegraphics[scale = .55]{../figures/email_data_request_2024_01}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Data folder for the data requested}
|
|
\begin{center}
|
|
\includegraphics[scale = .6]{../figures/email_data_request_2024_03}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}{What is bad about this data organisation?}
|
|
% slido
|
|
\centering
|
|
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 3}
|
|
|
|
\url{https://app.sli.do/event/3S1Bn3Tjknuk5J5WiqAYzG}
|
|
\end{frame}
|
|
|
|
\begin{frame}[<+->]{Bad things about this data organisation}
|
|
\begin{itemize}
|
|
\item Raw and processed data are in the same folder
|
|
\item File naming does not sort in a sensible way: Best order would be first
|
|
by subject, then by session
|
|
\item Data and data scripts are in the same folder
|
|
\item Data scripts are not numbered, unclear in what order they need to be
|
|
executed
|
|
\item There are plot files (PDFs) between the data and code files
|
|
\item It is unclear which are the final and processed data files
|
|
\item The final data files are not stored in an interoperable format: There
|
|
is only an \texttt{.RData} file that (probably) contains the final data
|
|
which was used for further analyses
|
|
\item There is no documentation whatsoever
|
|
\item \dots
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Topics for this semester}
|
|
\centering
|
|
\begin{tabular}{ll}
|
|
\hline
|
|
Date & Topic \\
|
|
\hline
|
|
2024-05-13 & Introduction to data management \\
|
|
2024-05-27 & Workflow \\
|
|
\only<1>{2024-06-10}\only<2>{\bf 2024-06-10} & \only<1>{Data organisation}\only<2>{\bf Data organisation}\\
|
|
2024-06-24 & Data sharing \\
|
|
2024-07-08 & Clean coding \\
|
|
2024-07-22 & Version control \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
% * different data sources
|
|
% * content README file
|
|
% * best arrangement of data
|
|
% * redundancy
|
|
% * anonymizing/pseudonymizing data
|
|
|
|
\section{Folder organisation}
|
|
|
|
\begin{frame}[<+->]{Some general rules}
|
|
\begin{itemize}
|
|
\item One project, one folder
|
|
\item Add README file at top level
|
|
\item Raw data are in a separate folder (and stay separate!)
|
|
\item Have a code folder
|
|
\item It is often a good idea to separate your data analysis from papers,
|
|
talks, etc. (especially if you want to publish your data)
|
|
\item Have designated folders where stuff is written to (e.\,g.,
|
|
\texttt{results}, \texttt{figures}, \texttt{processed}, etc.)
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Folder organisation}
|
|
{One possible example!}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+};
|
|
% first level
|
|
\node (n1) at (4, 0) {\faIcon{folder} \verb+admin+};
|
|
\node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
|
|
\node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
|
|
\node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
|
|
\node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (top.east) -- (n1.west);
|
|
\path (top.east) -- (n2.west);
|
|
\path (top.east) -- (n3.west);
\path (top.east) -- (n4.west);
|
|
\path (top.east) -- (file.west);
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Folder organisation}
|
|
{Analysis folder}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+};
|
|
% first level
|
|
\node (n1) at (4, 0) {\faIcon{folder} \verb+admin+};
|
|
\node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
|
|
\node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
|
|
\node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
|
|
\node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (top.east) -- (n1.west);
|
|
\path (top.east) -- (n2.west);
|
|
\path (top.east) -- (n3.west);
\path (top.east) -- (n4.west);
|
|
\path (top.east) -- (file.west);
|
|
% second level
|
|
\node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
|
|
\node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
|
|
\node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
|
|
\node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
|
|
\node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (n2.east) -- (o1.west);
|
|
\path (n2.east) -- (o2.west);
|
|
\path (n2.east) -- (o3.west);
|
|
\path (n2.east) -- (o4.west);
|
|
\path (n2.east) -- (o5.west);
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Folder organisation}
|
|
{Analysis folder}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+};
|
|
% first level
|
|
\node (n1) at (4, 0) {\faIcon{folder} \verb+admin+};
|
|
\node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
|
|
\node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
|
|
\node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
|
|
\node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (top.east) -- (n1.west);
|
|
\path (top.east) -- (n2.west);
|
|
\path (top.east) -- (n3.west);
\path (top.east) -- (n4.west);
|
|
\path (top.east) -- (file.west);
|
|
% second level
|
|
\node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
|
|
\node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
|
|
\node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
|
|
\node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
|
|
\node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (n2.east) -- (o1.west);
|
|
\path (n2.east) -- (o2.west);
|
|
\path (n2.east) -- (o3.west);
|
|
\path (n2.east) -- (o4.west);
|
|
\path (n2.east) -- (o5.west);
|
|
% third level
|
|
\node[text width = 5cm] (p1) at (12, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
|
|
\node[text width = 5cm] (p2) at (12, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+};
|
|
\node[text width = 5cm] (p3) at (12, -1.4) {\faIcon[regular]{file} \verb+03_modeling.R+};
|
|
\node[text width = 5cm] (p4) at (12, -2.1) {\faIcon[regular]{file} \verb+04_plots.R+};
|
|
\path (o1.center) -- (p1.west);
|
|
\path (o1.center) -- (p2.west);
|
|
\path (o1.center) -- (p3.west);
|
|
\path (o1.center) -- (p4.west);
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Folder organisation}
|
|
{Analysis folder}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+};
|
|
% first level
|
|
\node (n1) at (4, 0) {\faIcon{folder} \verb+admin+};
|
|
\node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
|
|
\node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
|
|
\node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
|
|
\node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (top.east) -- (n1.west);
|
|
\path (top.east) -- (n2.west);
|
|
\path (top.east) -- (n3.west);
\path (top.east) -- (n4.west);
|
|
\path (top.east) -- (file.west);
|
|
% second level
|
|
\node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
|
|
\node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
|
|
\node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
|
|
\node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
|
|
\node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (n2.east) -- (o1.west);
|
|
\path (n2.east) -- (o2.west);
|
|
\path (n2.east) -- (o3.west);
|
|
\path (n2.east) -- (o4.west);
|
|
\path (n2.east) -- (o5.west);
|
|
% third level
|
|
\node[text width = 5cm] (p1) at (12, 0) {\faIcon[regular]{file} \verb+subj1_ses01.txt+};
|
|
\node[text width = 5cm] (p2) at (12, -0.7) {\faIcon[regular]{file} \verb+subj1_ses02.txt+};
|
|
\node[text width = 5cm] (p3) at (12, -1.4) {\faIcon[regular]{file} \verb+subj2_ses01.txt+};
|
|
\node[text width = 5cm] (p4) at (12, -2.1) {\faIcon[regular]{file} \verb+subj2_ses02.txt+};
|
|
\node[text width = 5cm] (p5) at (12, -2.8) {\faIcon[regular]{file} \dots};
|
|
\path (o2.center) -- (p1.west);
|
|
\path (o2.center) -- (p2.west);
|
|
\path (o2.center) -- (p3.west);
|
|
\path (o2.center) -- (p4.west);
|
|
\path (o2.center) -- (p5.west);
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Folder organisation}
|
|
{Analysis folder}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+};
|
|
% first level
|
|
\node (n1) at (4, 0) {\faIcon{folder} \verb+admin+};
|
|
\node[text width = 3cm] (n2) at (3.5, -0.7) {\faIcon{folder} \verb+analysis+};
|
|
\node (n4) at (4, -1.4) {\faIcon{folder} \verb+dissemination+};
|
|
\node (n3) at (4, -2.1) {\faIcon{folder} \verb+material+};
|
|
\node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (top.east) -- (n1.west);
|
|
\path (top.east) -- (n2.west);
|
|
\path (top.east) -- (n3.west);
\path (top.east) -- (n4.west);
|
|
\path (top.east) -- (file.west);
|
|
% second level
|
|
\node (o1) at (8.5, 0) {\faIcon{folder} \verb+code+};
|
|
\node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+data+};
|
|
\node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
|
|
\node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
|
|
\node (o5) at (8.5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (n2.east) -- (o1.west);
|
|
\path (n2.east) -- (o2.west);
|
|
\path (n2.east) -- (o3.west);
|
|
\path (n2.east) -- (o4.west);
|
|
\path (n2.east) -- (o5.west);
|
|
% third level
|
|
\node[text width = 5cm] (p1) at (12, -0.7) {\faIcon[regular]{file}
|
|
\verb+data_all-subj.csv+};
|
|
\node[text width = 5cm] (p2) at (12, -1.4) {\faIcon[regular]{file}
|
|
\verb+data_all-subj.RData+};
|
|
\node[text width = 5cm] (p3) at (12, -2.1) {\faIcon[regular]{file}
|
|
\verb+eval_model1.csv+};
|
|
\node[text width = 5cm] (p4) at (12, -2.8) {\faIcon[regular]{file}
|
|
\verb+eval_model2.csv+};
|
|
\path (o4.center) -- (p1.west);
|
|
\path (o4.center) -- (p2.west);
|
|
\path (o4.center) -- (p3.west);
|
|
\path (o4.center) -- (p4.west);
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\pause
|
|
The analysis folder you might want to share on OSF, GitHub, etc.
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Folder organisation}
|
|
{Dissemination folder}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node[text width = 2cm] (top) at (0, 0) {\faIcon{folder} \verb+project+};
|
|
% first level
|
|
\node (n1) at (4, 0) {\faIcon{folder} \verb+admin+};
|
|
\node (n2) at (4, -0.7) {\faIcon{folder} \verb+analysis+};
|
|
\node[text width = 3.2cm] (n3) at (3.6, -1.4) {\faIcon{folder} \verb+dissemination+};
|
|
\node (n4) at (4, -2.1) {\faIcon{folder} \verb+material+};
|
|
\node (file) at (4, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (top.east) -- (n1.west);
|
|
\path (top.east) -- (n2.west);
|
|
\path (top.east) -- (n3.west);
\path (top.east) -- (n4.west);
|
|
\path (top.east) -- (file.west);
|
|
% second level
|
|
\node (o1) at (8.5, 0) {\faIcon{folder} \verb+paper+};
|
|
\node (o2) at (8.5, -0.7) {\faIcon{folder} \verb+talks+};
|
|
\node (o3) at (8.5, -1.4) {\faIcon{folder} \verb+figures+};
|
|
\node (o4) at (8.5, -2.1) {\faIcon{folder} \verb+results+};
|
|
\node (o5) at (8.5, -2.8) {\faIcon{folder} \verb+tables+};
|
|
\path (n3.east) -- (o1.west);
|
|
\path (n3.east) -- (o2.west);
|
|
\path (n3.east) -- (o3.west);
|
|
\path (n3.east) -- (o4.west);
|
|
\path (n3.east) -- (o5.west);
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\pause
|
|
Having separate folders for figures and tables helps you keep track of them
|
|
for your paper and talks
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Figures and tables}
|
|
\begin{itemize}
|
|
\item Most of us (including me!) are not at a stage where we are
|
|
writing our papers or talks as reproducible documents
|
|
\pause
|
|
\item It is still a good idea to create tables and figures in R and keep the
|
|
code easily accessible
|
|
\pause
|
|
\item One suggestion
|
|
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4.2cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
% figures
|
|
\node (fig) at (0, 0) {\faIcon{folder} \verb+figures+};
|
|
\node (n1) at (4, 0) {\faIcon[regular]{file} \verb+h1_barplot.R+};
|
|
\node (n2) at (4, -0.7) {\faIcon[regular]{file} \verb+h1_barplot.png+};
|
|
\path (fig.center) -- (n1.west);
|
|
\path (fig.center) -- (n2.west);
|
|
% tables
|
|
\node (tab) at (0, -1.5) {\faIcon{folder} \verb+tables+};
|
|
\node (o1) at (4, -1.5) {\faIcon[regular]{file} \verb+h1_mean-table.Rmd+};
|
|
\node (o2) at (4, -2.2) {\faIcon[regular]{file} \verb+h1_mean-table.docx+};
|
|
\path (tab.center) -- (o1.west);
|
|
\path (tab.center) -- (o2.west);
|
|
\end{tikzpicture}
|
|
\pause
|
|
\item I export the data for figures and tables from \texttt{analysis/code}
|
|
to \texttt{dissemination/results} so the dissemination folder is
|
|
self-contained
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Several data sources}
|
|
\begin{itemize}
|
|
\item When you have several different data sources like questionnaires and
|
|
eye-tracking data, keep them in separate folders
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node (data) at (0, 0) {\faIcon{folder} \verb+data+};
|
|
\node (n1) at (4, 0) {\faIcon{folder} \verb+eyetracking+};
|
|
\node (n2) at (4, -0.7) {\faIcon{folder} \verb+qualtrics+};
|
|
\path (data.center) -- (n1.west);
|
|
\path (data.center) -- (n2.west);
|
|
\end{tikzpicture}
|
|
\pause
|
|
\item Process them separately, e.\,g., with
|
|
\verb+01a_preprocessing_eyetracking.R+ and
|
|
\verb+01b_preprocessing_surveys.R+ and then \verb+02_combine-data.R+
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 5cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node (results) at (0, 0) {\faIcon{folder} \verb+results+};
|
|
\node (n1) at (4, 0) {\faIcon[regular]{file} \verb+data_eyetracking.csv+};
|
|
\node (n2) at (4, -0.7) {\faIcon[regular]{file} \verb+data_surveys.csv+};
|
|
\node (n3) at (4, -1.4) {\faIcon[regular]{file} \verb+data_complete.csv+};
|
|
\path (results.center) -- (n1.west);
|
|
\path (results.center) -- (n2.west);
|
|
\path (results.center) -- (n3.west);
|
|
\end{tikzpicture}
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Toy example with 11 questions}
|
|
Thank you everybody for filling out our little toy survey in Qualtrics!
|
|
\vfill
|
|
\tiny
|
|
\begin{tabular}{lllll}
|
|
\hline
|
|
ResponseId & age & sex & data\_sharing\_1 & data\_sharing\_2 \\
|
|
\hline
|
|
R\_225ffqhb7qRaIGO:1 & Min. :24.00 & m : 2 & No :7 & Min. :1.000 \\
|
|
R\_2F9fXxf3NedHqZl:1 & 1st Qu.:26.50 & f :11 & Yes:7 & 1st Qu.:1.000 \\
|
|
R\_2foYj4iSgaBTkEO:1 & Median :28.00 & d : 1 & & Median :2.000 \\
|
|
R\_2J9B4aLaasQ1m81:1 & Mean :29.86 & not indicated: 0 & & Mean :2.214 \\
|
|
R\_2P1TMDNlwm0gSIk:1 & 3rd Qu.:30.00 & & & 3rd Qu.:2.000 \\
|
|
R\_2pXfOSq8DBImG6R:1 & Max. :43.00 & & & Max. :6.000 \\
|
|
(Other) :8 & & & & \\
|
|
\hline
|
|
\end{tabular}
|
|
|
|
\vspace{.5cm}
|
|
\begin{tabular}{lllllll}
|
|
\hline
|
|
rdm\_stmnt\_1 & rdm\_stmnt\_2 & rdm\_stmnt\_3 & rdm\_stmnt\_4 & rdm\_stmnt\_5 & career\_level\_1 & career\_level\_2 \\
|
|
\hline
|
|
Min. :2.000 & Min. :2 & Min. :2.000 & Min. :1.000 & Min. :1.000 & Student : 0 & Min. : 1.000 \\
|
|
1st Qu.:3.250 & 1st Qu.:4 & 1st Qu.:2.250 & 1st Qu.:1.000 & 1st Qu.:1.000 & PhD student :11 & 1st Qu.: 1.625 \\
|
|
Median :4.500 & Median :4 & Median :3.000 & Median :1.000 & Median :1.000 & Postdoc : 1 & Median : 2.500 \\
|
|
Mean :4.071 & Mean :4 & Mean :2.857 & Mean :1.143 & Mean :1.143 & Senior researcher: 0 & Mean : 5.964 \\
|
|
3rd Qu.:5.000 & 3rd Qu.:5 & 3rd Qu.:3.000 & 3rd Qu.:1.000 & 3rd Qu.:1.000 & Professor : 1 & 3rd Qu.: 4.500 \\
|
|
Max. :5.000 & Max. :5 & Max. :5.000 & Max. :2.000 & Max. :2.000 & Other : 1 & Max. :38.000 \\
|
|
& NA's :1 & & & & & \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
% print(xtable::xtable(summary(dat[, 1:5])), include.rownames = FALSE)
|
|
% print(xtable::xtable(summary(dat[, 6:12])), include.rownames = FALSE)
|
|
|
|
\begin{frame}[fragile]{Folder structure for toy example}
|
|
{One possible structure!}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 4.3cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node (ex) at (0, 0) {\faIcon{folder} \verb+example+};
|
|
\node (n1) at (3, 0) {\faIcon{folder} \verb+code+};
|
|
\node (n2) at (3, -0.7) {\faIcon{folder} \verb+data+};
|
|
\node (n3) at (3, -1.4) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (ex.center) -- (n1.west);
|
|
\path (ex.center) -- (n2.west);
|
|
\path (ex.center) -- (n3.west);
|
|
|
|
\node (o1) at (7, 0.7) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
|
|
\node (o2) at (7, -0.7) {\faIcon{folder} \verb+codebook+};
|
|
\node (o3) at (7, -1.4) {\faIcon{folder} \verb+rawdata+};
|
|
\node (o4) at (7, -2.1) {\faIcon{folder} \verb+results+};
|
|
\path (n1.center) -- (o1.west);
|
|
\path (n2.center) -- (o2.west);
|
|
\path (n2.center) -- (o3.west);
|
|
\path (n2.center) -- (o4.west);
|
|
|
|
\node (p1) at (11, -0.7) {\faIcon[regular]{file} \verb+codebook_01.R+};
|
|
\node (p2) at (11, -1.4) {\faIcon[regular]{file} \verb+codebook_01.xlsx+};
|
|
\node (p3) at (11, -2.1) {\dots};
|
|
|
|
\path (o2.center) -- (p1.west);
|
|
\path (o2.center) -- (p2.west);
|
|
\path (o2.center) -- (p3.west);
|
|
\end{tikzpicture}
|
|
|
|
\end{frame}
|
|
|
|
\section{Metadata}
|
|
|
|
\begin{frame}{Metadata answers questions}
|
|
\begin{itemize}
|
|
\item {\bf Who} created the data?
|
|
\item {\bf Why} was the data created?
|
|
\item {\bf When} was the data created?
|
|
\item {\bf Where} is the data?
|
|
\item {\bf How} was the data created?
|
|
\item {\bf What} is the content of the data?
|
|
\end{itemize}
|
|
\vfill
|
|
\hfill{\tiny \citet{Wilbrandt2023}}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Metadata}
|
|
\begin{block}{Metadata}
|
|
\dots is data about data.\\
|
|
\dots can be \emph{descriptive}, \emph{structural}, or \emph{administrative}.
|
|
\end{block}
|
|
\vfill
|
|
\begin{columns}
|
|
\begin{column}[t]{.5\textwidth}
|
|
Contains information on origin and background of data like
|
|
\begin{itemize}
|
|
\item Who, when, why, how, \dots
|
|
\item Used resources
|
|
\item Used abbreviations, units, names
|
|
\item Licenses
|
|
\item \dots
|
|
\end{itemize}
|
|
\end{column}
|
|
\begin{column}[t]{.5\textwidth}
|
|
Data can be anything like
|
|
\begin{itemize}
|
|
\item Book content
|
|
\item Pictures or audio files
|
|
\item Website content or a blog post
|
|
\item Journal paper
|
|
\item Research data
|
|
\item \dots
|
|
\end{itemize}
|
|
\end{column}
|
|
\end{columns}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}{Metadata examples}
|
|
{Photo}
|
|
\begin{center}
|
|
\includegraphics[scale = .31]{../figures/metadata_photo}
|
|
\end{center}
|
|
\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Metadata examples}
|
|
{Book}
|
|
\begin{center}
|
|
\includegraphics[scale = .36]{../figures/metadata_book}
|
|
\end{center}
|
|
\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Metadata examples}
|
|
{Webpage}
|
|
\begin{center}
|
|
\includegraphics[scale = .27]{../figures/metadata_webpage}
|
|
\end{center}
|
|
\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Metadata examples}
|
|
{WORD document}
|
|
\begin{center}
|
|
\includegraphics[scale = .23]{../figures/metadata_word_document}
|
|
\end{center}
|
|
\hfill{\tiny \url{https://dataedo.com/kb/data-glossary/what-is-metadata}}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Metadata for research data}
|
|
\begin{tikzpicture}
|
|
\node[font=\Large] (n1) at (0,0) {\bf \color{iwmorange} Study};
|
|
|
|
\node[font=\large] (i1) at (0,-1) {$\bullet$ Persons};
|
|
\node[font=\large] (i2) at (.36,-1.5) {$\bullet$ Background};
|
|
\node[font=\large] (i3) at (.03,-2) {$\bullet$ Funding};
|
|
\node[font=\large] (i4) at (-.38,-2.5) {$\bullet$ \dots};
|
|
\node[draw=iwmorange, thick, fit={(n1) (i1) (i2) (i3) (i4)}, inner sep=10pt] (box) {};
|
|
|
|
\node[font=\Large] (n2) at (5,0) {\bf \color{iwmorange} Data set};
|
|
|
|
\node[font=\large] (j1) at (4.3,-1) {$\bullet$ Files};
|
|
\node[font=\large] (j2) at (4.57,-1.5) {$\bullet$ Sources};
|
|
\node[font=\large] (j3) at (4.65,-2) {$\bullet$ Methods};
|
|
\node[font=\large] (j4) at (4.18,-2.5) {$\bullet$ \dots};
|
|
\node[draw=iwmorange, thick, fit={(n2) (j1) (j2) (j3) (j4)}, inner sep=10pt] (box) {};
|
|
|
|
\node[font=\Large] (n3) at (10,0) {\bf \color{iwmorange} Variables};
|
|
|
|
\node[font=\large] (k1) at (9.7,-1) {$\bullet$ Data type};
|
|
\node[font=\large] (k2) at (9.69,-1.5) {$\bullet$ Scale unit};
|
|
\node[font=\large] (k3) at (9.85,-2) {$\bullet$ Value range};
|
|
\node[font=\large] (k4) at (9.12,-2.5) {$\bullet$ \dots};
|
|
\node[draw=iwmorange, thick, fit={(n3) (k1) (k2) (k3) (k4)}, inner sep=10pt] (box) {};
|
|
|
|
\draw[-latex, thick] (n1) -- (n2);
|
|
\draw[-latex, thick] (n2) -- (n3);
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\hfill\tiny \url{https://datamanagement.hms.harvard.edu/collect/readme-files}
|
|
\end{frame}
|
|
|
|
\section{README files}
|
|
|
|
\begin{frame}{README files}
|
|
\begin{itemize}
|
|
\item Can be used to give information about all levels in a research
|
|
project: study/project, data set, variables; either in one README or in
|
|
several ones
|
|
\item Should provide a clear and concise description of all relevant details
|
|
about data collection, processing, and analysis
|
|
\item README files are created for different purposes:
|
|
\begin{itemize}
|
|
\item to document changes to files or file names within a folder
|
|
\item to explain file naming conventions, practices, etc.\ ``in
|
|
general'' for future reference
|
|
\item to specifically accompany files/data being deposited in a
|
|
repository
|
|
\end{itemize}
|
|
\item Creating a README file at the beginning of your research process,
|
|
and updating it consistently throughout your research, will help you
|
|
to compile a final README file when your data is ready for deposit
|
|
\item Find a template here:
|
|
\url{https://cornell.app.box.com/v/ReadmeTemplate}
|
|
\end{itemize}
|
|
\vfill
|
|
\hfill\tiny \url{https://datamanagement.hms.harvard.edu/collect/readme-files}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Study/project}{README on top level}
|
|
\begin{itemize}
|
|
\item Project name and purpose
|
|
\item Funding information (process number!)
|
|
\item Ethics approved? LEK number!
|
|
\item Person(s) responsible for conducting the study
|
|
\item One or several studies? Information about them
|
|
\item Time/Duration of project
|
|
\item \dots
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Data set}{README accompanying data set(s)}
|
|
\begin{itemize}
|
|
\item One or more data sets?
|
|
\item Time of data collection
|
|
\item Person(s) responsible for data collection
|
|
\item File organisation
|
|
\item Naming conventions
|
|
\item Preprocessing methods
|
|
\item Anything that is special about the data set(s)
|
|
\item Number of subjects
|
|
\item Variables
|
|
\item \dots
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Variables}{README accompanying a specific data set}
|
|
\begin{itemize}
|
|
\item You can use a README (or text file called \texttt{codebook.txt} or
|
|
similar) to document your variables
|
|
\item Especially if you only have a few variables, this is an easy and fast
|
|
way to document them
|
|
\item If you are working with extensive surveys or questionnaires, it might
|
|
be a good time investment to create a more elaborate codebook
|
|
\end{itemize}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\section{Codebooks}
|
|
|
|
\begin{frame}{What information about variables should a codebook include?}
|
|
% slido
|
|
\centering
|
|
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 3}
|
|
|
|
\url{https://app.sli.do/event/3S1Bn3Tjknuk5J5WiqAYzG}
|
|
\end{frame}
|
|
|
|
\begin{frame}{A codebook should include}
|
|
\begin{tabular}{lp{11cm}}
|
|
\hline
|
|
Variable name & Usually some abbreviation like \texttt{pna01} \\
|
|
Variable label & Brief description to identify variable \\
|
|
Question text & If applicable, exact wording from survey question \\
|
|
Values & Values variable can take (e.\,g., 1 to 5) \\
|
|
Value labels & If applicable, textual descriptions of the values \\
|
|
Statistics & For example, range, mean, standard deviation for
|
|
numeric variables; frequencies and percentages for categorical variables \\
|
|
Missing data & If applicable, values and labels of missing data \\
|
|
Notes & Additional notes, remarks, or comments; for measures or
|
|
questions from copyrighted instruments, the notes field can be used to
|
|
cite the source \\
|
|
\hline
|
|
\end{tabular}
|
|
\vfill
|
|
|
|
\hfill\tiny \url{https://www.icpsr.umich.edu/web/ICPSR/cms/1983}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Codebooks}
|
|
\begin{itemize}
|
|
\item There are many different ways to create a codebook
|
|
\item It can be a README, some other plain text file, a table (stored as CSV
|
|
or XLSX), a WORD document, or PDF
|
|
\item For a short questionnaire, it can be sufficient to export it as a PDF
|
|
\item Let's walk through a couple of options\dots
|
|
\end{itemize}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}{Option 1 -- Toy example with 11 questions}
|
|
{Simple PDF}
|
|
\begin{columns}
|
|
\begin{column}{.5\textwidth}
|
|
\begin{center}
|
|
\vspace{-.4cm}
|
|
Export from Qualtrics\\
|
|
\includegraphics[scale = .3]{../figures/codebook_1.png}
|
|
\end{center}
|
|
\end{column}
|
|
\begin{column}{.6\textwidth}
|
|
\begin{itemize}
|
|
\item For a simple questionnaire like this, the WORD document exported
from Qualtrics and converted to PDF might be sufficient as a codebook
|
|
\item For longer questionnaires, the WORD document can still be a good
|
|
starting point to create a more elaborate codebook
|
|
\end{itemize}
|
|
\end{column}
|
|
\end{columns}
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Option 2 -- Toy example with 11 questions}
|
|
{Plain text file}
|
|
\begin{center}
|
|
\vspace{-.3cm}
|
|
\footnotesize
|
|
\begin{lstlisting}[language = bash, identifierstyle=\color{iwmgray}]
|
|
sex. Please indicate your sex.
|
|
-------------------------------------------------------------------------------
|
|
-1. m
|
|
-2. f
|
|
-3. d
|
|
-4. not indicated
|
|
|
|
age. How old are you? Please enter your age in years.
|
|
-------------------------------------------------------------------------------
|
|
numerical input
|
|
|
|
data_sharing_1. Have you ever published data in a repository?
|
|
-------------------------------------------------------------------------------
|
|
-1. No
|
|
-2. Yes
|
|
\end{lstlisting}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Option 3 -- Toy example with 11 questions}
|
|
{Creating a simple codebook in R ``by hand''}
|
|
\footnotesize
|
|
\begin{lstlisting}
|
|
load("results/data_rdm-ms-ss2024_cleaned.RData")
|
|
codebook <- data.frame(var_name = names(dat),
|
|
var_text = c("Response Id", "Please indicate your sex.",
|
|
"How old are you? Please enter your age in years.",
|
|
...
|
|
"Sharing data is bad scientific practice",
|
|
"What is your current career level?",
|
|
"How long have you been working in science (in years)?"))
|
|
|
|
codebook$type <- sapply(dat, class)
|
|
codebook$n <- sapply(dat, length)
|
|
codebook$mean <- sapply(dat,
|
|
function(x) ifelse(is.numeric(x), mean(x, na.rm = TRUE), NA))
|
|
codebook$sd <- sapply(dat,
  function(x) ifelse(is.numeric(x), sd(x, na.rm = TRUE), NA))
|
|
|
|
openxlsx::write.xlsx(codebook, file = "codebook/codebook_01.xlsx")
|
|
\end{lstlisting}
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Option 3 -- Toy example with 11 questions}
|
|
{Creating a simple codebook in R ``by hand''}
|
|
\begin{center}
|
|
\includegraphics[scale = .6]{../figures/codebook_2.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Option 4 -- Toy example with 11 questions}
|
|
{Using the codebook package in R}
|
|
\begin{itemize}
|
|
\item When you export a Qualtrics questionnaire as an SPSS file and import it
|
|
into R using the haven package, you can use RMarkdown to create an
|
|
elaborate HTML codebook
|
|
\item It works best for classical questionnaire items
|
|
\item In our example, the survey is not formatted well enough for the
|
|
generated codebook to be completely correct
|
|
\end{itemize}
|
|
\footnotesize
|
|
\begin{lstlisting}
|
|
#' ---
|
|
#' title: Codebook for Data Set "RDM MS SS 2024"
|
|
#' author: Nora Wickelmaier
|
|
#' ---
|
|
|
|
#+ echo = FALSE
|
|
dat <- haven::read_spss("../rawdata/RDM_MS_SS2024_download_2024-06-04.sav")
|
|
codebook::codebook(dat)
|
|
\end{lstlisting}
|
|
\end{frame}
|
|
|
|
\appendix
|
|
%%\begin{frame}[allowframebreaks]{References}
|
|
\begin{frame}{References}
|
|
%\renewcommand{\bibfont}{\small}
|
|
\printbibliography
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\end{document}
|
|
|