516 lines
17 KiB
TeX
516 lines
17 KiB
TeX
\documentclass[aspectratio=169]{beamer}
|
|
|
|
\usepackage{listings}
|
|
%\usepackage[utf8]{inputenc}
|
|
\usepackage[style = apa, backend = biber, natbib = true]{biblatex}
|
|
\addbibresource{../literature/lit.bib}
|
|
|
|
\usepackage{fancyvrb}
|
|
\usepackage{fontawesome5} % get icons
|
|
\usepackage{multirow}
|
|
\usepackage{color, colortbl}
|
|
|
|
\usepackage{tikz}
|
|
\usetikzlibrary{fit}
|
|
\usepackage[edges]{forest}
|
|
|
|
% Global configuration for all listings: R as the default language, typeset in
% typewriter font on a light gray background, with no frame and no line
% numbers. Comments, keywords, identifiers and strings each get their own
% iwm* colour; these colours are defined with \definecolor further down in
% this preamble.
\lstset{language=R,%
|
|
backgroundcolor=\color{iwmgray!15!white},
|
|
basicstyle=\ttfamily\color{iwmgray},
|
|
frame=none,
|
|
commentstyle=\slshape\color{iwmgreen},
|
|
keywordstyle=\bfseries\color{iwmgray},
|
|
identifierstyle=\color{iwmpurple},
|
|
stringstyle=\color{iwmblue},
|
|
numbers=none,%left,numberstyle=\tiny,
|
|
basewidth={.5em, .4em},
|
|
showstringspaces=false,
|
|
emphstyle=\color{red!50!white}}
|
|
|
|
% Make \newblock expand to beamer's internal \beamer@newblock; \makeatletter
% is required because the internal name contains an @.
% NOTE(review): presumably a workaround so bibliography entries are formatted
% by beamer's own block handling -- confirm it is still needed.
\makeatletter \def\newblock{\beamer@newblock} \makeatother
|
|
|
|
\beamertemplatenavigationsymbolsempty
|
|
\setbeamertemplate{itemize items}[circle]
|
|
\setbeamertemplate{section in toc}[circle]
|
|
\mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
|
|
\setbeamercolor{block body}{bg=iwmorange!50!white}
|
|
\setbeamercolor{block title}{fg=white, bg=iwmorange}
|
|
% Definitions for biblatex
|
|
\setbeamercolor{bibliography entry note}{fg=iwmgray}
|
|
\setbeamercolor{bibliography entry author}{fg=iwmgray}
|
|
\setbeamertemplate{bibliography item}{}
|
|
|
|
\definecolor{iwmorange}{RGB}{255,105,0}
|
|
\definecolor{iwmgray}{RGB}{67,79,79}
|
|
\definecolor{iwmblue}{RGB}{60,180,220}
|
|
\definecolor{iwmgreen}{RGB}{145,200,110}
|
|
\definecolor{iwmpurple}{RGB}{120,0,75}
|
|
|
|
\setbeamercolor{title}{fg=iwmorange}
|
|
\setbeamercolor{frametitle}{fg=iwmorange}
|
|
\setbeamercolor{structure}{fg=iwmorange}
|
|
\setbeamercolor{normal text}{fg=iwmgray}
|
|
\setbeamercolor{author}{fg=iwmgray}
|
|
\setbeamercolor{date}{fg=iwmgray}
|
|
|
|
% Math shorthands for bold symbols, each taking one argument:
% \vect and \mat set it with \mathbf, \gvect and \gmat with \boldsymbol.
% NOTE(review): the g-variants are presumably meant for Greek letters, which
% \mathbf does not embolden -- confirm.
\newcommand{\vect}[1]{\mathbf{#1}}
|
|
\newcommand{\mat}[1]{\mathbf{#1}}
|
|
\newcommand{\gvect}[1]{\boldsymbol{#1}}
|
|
\newcommand{\gmat}[1]{\boldsymbol{#1}}
|
|
|
|
% Insert an outline frame at the start of every \section. The empty optional
% argument means nothing is inserted for starred \section* commands.
% sectionstyle=show/hide shows the current section and hides all others;
% subsectionstyle=show/show/hide shows the subsections of the current section
% and hides those of the other sections.
\AtBeginSection[]{
|
|
\frame{
|
|
\tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}}
|
|
|
|
% Headline: a horizontal section navigation bar spanning the full paper
% width, set in the "section in head" beamer colour with a little vertical
% padding above and below.
\setbeamertemplate{headline}{
|
|
\begin{beamercolorbox}{section in head}
|
|
\vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
|
|
\end{beamercolorbox}
|
|
}
|
|
|
|
% Footline: only the frame number, pushed to the right edge by \hfill and
% followed by a thick math space (\;) so it does not touch the border.
\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
|
|
|
|
\title{Data sharing}
|
|
\author{Nora Wickelmaier}
|
|
\date{June 24, 2024}
|
|
|
|
\begin{document}
|
|
|
|
\begin{frame}{}
|
|
\thispagestyle{empty}
|
|
\titlepage
|
|
\end{frame}
|
|
|
|
\begin{frame}{What are the benefits of sharing your data?}
|
|
% slido
|
|
\centering
|
|
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
|
|
|
|
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
|
|
\end{frame}
|
|
|
|
\begin{frame}[<+->]{Benefits of sharing data}
|
|
Sharing data
|
|
\begin{itemize}
|
|
\item[\dots] ensures that data are not ultimately lost (save data for posterity)
|
|
\item[\dots] is consistent with scientific norms of openness and rigor
|
|
\item[\dots] increases citation scores of papers
|
|
\item[\dots] encourages more research because it enables secondary analyses
|
|
\item[\dots] facilitates subsequent reanalyses (correct errors, emphasize
|
|
robustness of original results)
|
|
\item[\dots] is demanded by most third party funding agencies
|
|
\end{itemize}
|
|
\vfill
|
|
\hfill\tiny \citet{Wicherts2012}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Agenda}
|
|
\centering
|
|
\begin{tabular}{ll}
|
|
\hline
|
|
Date & Topic \\
|
|
\hline
|
|
2024-05-13 & Introduction to data management \\
|
|
2024-05-27 & Workflow \\
|
|
2024-06-10 & Data organisation\\
|
|
\only<1>{2024-06-24}\only<2>{\bf 2024-06-24} & \only<1>{Data sharing}\only<2>{\bf Data sharing} \\
|
|
2024-07-08 & Clean coding \\
|
|
2024-07-22 & Version control \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
% uploading under a license (CC-BY....)
|
|
% loading data on an archive, repository etc...
|
|
% Doing the archive
|
|
% Important things before the open-access data
|
|
% Where to store data for long-term accessibility (conventions?)
|
|
% Tools, where I should upload my final data
|
|
% Upload data before or after publishing a paper? Time management
|
|
|
|
\section{Data organisation}
|
|
|
|
\begin{frame}[<+->]{What we covered so far}
|
|
\begin{itemize}
|
|
\item What habits do we need for effective research data management?
|
|
\item What is a workflow and why do we need one?
|
|
\item What needs to be considered when naming files of a research project?
|
|
\item How to organize folders for a research project?
|
|
\item What metadata should be added to my research project?
|
|
\item What are good ways to document a data set?
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Examples for documenting data sets}
|
|
\begin{enumerate}
|
|
\item A recent paper with published data by \citet{Ngo2023} investigating
|
|
what cues are considered by Twitter users to identify social bots
|
|
\item A multi-cohort, longitudinal study by the Hector Research Institute of
|
|
Education Sciences and Psychology at the University of Tübingen:
|
|
Transformation of the secondary school system and academic careers
|
|
\citep[TOSCA,][]{Koeller2004}
|
|
\item Editorial on why to publish your data with an accompanying data set
|
|
by \citet{Wicherts2012}
|
|
\end{enumerate}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}{\citet{Ngo2023}}
|
|
They provide
|
|
\begin{itemize}
|
|
\item A data set with 221 observations and 633 variables
|
|
\item A PDF with all measures and the scenario used for collecting the data
|
|
\end{itemize}
|
|
\vspace{.3cm}
|
|
\begin{block}{Exercise}
|
|
\begin{itemize}
|
|
\item Go to \url{https://osf.io/6y3nk/} and download the files
|
|
\texttt{data.csv} and
|
|
\texttt{Experimental-Study-Measures and scenario.pdf}
|
|
\item Read the data into R using \texttt{read.csv()}
|
|
\item Find out which variables in the data correspond to measure
|
|
``(9)~Demographics''
|
|
\end{itemize}
|
|
\end{block}
|
|
\vspace{.3cm}
|
|
\pause
|
|
(BTW: Sharing the data in this form is better than \emph{not} sharing them,
|
|
in my opinion)
|
|
\end{frame}
|
|
|
|
\begin{frame}{What additional information do we need to use these data?}
|
|
% slido
|
|
\centering
|
|
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
|
|
|
|
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
|
|
\end{frame}
|
|
|
|
\begin{frame}{TOSCA}
|
|
\begin{itemize}
|
|
\item Multi-cohort study that includes longitudinal data for several cohorts
|
|
\item Broad spectrum of achievement test data and psycho-social variables
|
|
\item Large number of publications on different topics using these data
|
|
\item This is not the original data set, but a prepared version for teaching
|
|
statistics (hence, proportions in the data and the codebook are not
|
|
identical)
|
|
\end{itemize}
|
|
\begin{block}{Exercise}
|
|
\begin{itemize}
|
|
\item Read the data set \texttt{TOSCAtoTeach\_W123.sav} into R using
|
|
\texttt{foreign::read.spss()} or \texttt{haven::read\_spss()}
|
|
\item Create contingency tables for the variables \texttt{sform} and
|
|
\texttt{szweig1} and compare the results to the codebook
|
|
\texttt{Skalenhandbuch\_TOSCAtoTeachW123.pdf}
|
|
\end{itemize}
|
|
\end{block}
|
|
\hfill{\tiny \url{https://uni-tuebingen.de/en/faculties/faculty-of-economics-and-social-sciences/subjects/department-of-social-sciences/education-sciences-and-psychology/research/current-studies/tosca}}
|
|
\end{frame}
|
|
|
|
\begin{frame}{\citet{Wicherts2012}}
|
|
They provide
|
|
\begin{itemize}
|
|
\item A data set with 537 observations and 79 variables
|
|
(\texttt{1-s2.0-S0160289612000050-mmc2.xls})
|
|
\item A codebook with variable names and some descriptive statistics for
|
|
the scales (\texttt{1-s2.0-S0160289612000050-mmc1.doc})
|
|
\item ``Publish (your data) or (let the data) perish! Why not publish your
|
|
data too?''
|
|
\item Data come from a freshman-testing program called ``Testweek''
|
|
\item (Try \texttt{readxl::read\_excel()} to read the data into R)
|
|
\end{itemize}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}{What is the single one thing that would make sharing these data
|
|
infinitely better?}
|
|
% slido
|
|
\centering
|
|
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
|
|
|
|
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
|
|
\end{frame}
|
|
|
|
\begin{frame}[<+->]{Non-anonymous data}
|
|
\begin{itemize}
|
|
\item Before putting data into any cloud, you should always take a moment to
|
|
reflect on whether your data are anonymous
|
|
\item No (third-party) cloud storage, even if it is not publicly accessible
|
|
\item If your data contain personal data, they should always be stored
|
|
locally, ideally on an encrypted device
|
|
\item You should have a plan --- before ever collecting the data --- how,
|
|
when, and by whom the data will be anonymized
|
|
\item All data should eventually be anonymized! (Yes, even audio and video
|
|
data)
|
|
\item IWM servers can be considered local
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\section[Collaborative use]{Sharing data for collaborative use}
|
|
|
|
\begin{frame}[<+->]{Working together with the same data}
|
|
\begin{itemize}
|
|
\item Part of data organisation is to think about who needs access to
|
|
your data
|
|
\item Often these are colleagues from the same lab and there is
|
|
infrastructure to share files and scripts easily
|
|
\item The IWM offers several solutions for sharing your data (internally and
|
|
externally)
|
|
\item When the end goal is to make the data public, it might be a good idea
|
|
to work together at a place where the data can go public at a certain
|
|
point in time
|
|
\item We will look at two possibilities: OSF and GitHub
|
|
\end{itemize}
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\begin{frame}{IWM solutions}
|
|
IWM servers
|
|
\begin{itemize}
|
|
\item Nextcloud: \url{https://nextcloud.iwm-tuebingen.de/}
|
|
\item Gitea: \url{https://gitea.iwm-tuebingen.de/}
|
|
\item Shared drive: \texttt{Y:/}
|
|
\end{itemize}
|
|
\vspace{.4cm}
|
|
Microsoft servers
|
|
\begin{itemize}
|
|
\item OneDrive
|
|
\item Teams
|
|
\end{itemize}
|
|
\vfill
|
|
\pause
|
|
(Maybe check out the three tips of the week on this topic:
|
|
{\tiny
|
|
\url{https://iwmonline.sharepoint.com/sites/intranet/SitePages/direktorat/en/Interne-Kommunikation.aspx\#tip-of-the-week-tutorial-series}})
|
|
\end{frame}
|
|
|
|
\begin{frame}{Open Science Framework}
|
|
{\url{https://osf.io/}}
|
|
\begin{columns}
|
|
\begin{column}{.4\textwidth}
|
|
\begin{itemize}
|
|
\item ``OSF is a free and open source project management tool that supports
|
|
researchers throughout their entire project lifecycle.''
|
|
\item Founded in 2012 and constantly developed: \url{https://www.cos.io/timeline}
|
|
\item Meant to integrate all research steps
|
|
\end{itemize}
|
|
\end{column}
|
|
\begin{column}{.7\textwidth}
|
|
\includegraphics[scale = .2]{../figures/osf_workflow.png}
|
|
\end{column}
|
|
\end{columns}
|
|
\end{frame}
|
|
|
|
\begin{frame}[fragile]{Let's try it out}
|
|
\begin{tikzpicture}[
|
|
every node/.style = {text width = 5.1cm, align = left},
|
|
every path/.style = {thick, draw}
|
|
]
|
|
\node (ex) at (0, 0) {\faIcon{folder} \verb+toyexample+};
|
|
\node (n1) at (5, 0) {\faIcon{folder} \verb+code+};
|
|
\node (n2) at (5, -1.4) {\faIcon{folder} \verb+data+};
|
|
\node (n3) at (5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
|
|
\path (ex.center) -- (n1.west);
|
|
\path (ex.center) -- (n2.west);
|
|
\path (ex.center) -- (n3.west);
|
|
|
|
\node (o1a) at (10, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
|
|
\node (o1b) at (10, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+};
|
|
\node (o2) at (10, -1.4) {\faIcon{folder} \verb+processed+};
|
|
\node (o3) at (10, -2.1) {\faIcon{folder} \verb+rawdata+};
|
|
\node (o4) at (10, -2.8) {\faIcon[regular]{file} \verb+codebook.pdf+};
|
|
\path (n1.center) -- (o1a.west);
|
|
\path (n1.center) -- (o1b.west);
|
|
\path (n2.center) -- (o2.west);
|
|
\path (n2.center) -- (o3.west);
|
|
\path (n2.center) -- (o4.west);
|
|
\end{tikzpicture}
|
|
Steps
|
|
\begin{enumerate}
|
|
\item You need an OSF account -- just sign up with an e-mail address or use ORCID
|
|
\item Sign in
|
|
\item Create a project
|
|
\item Upload (or link) your files
|
|
\item Invite contributors
|
|
\end{enumerate}
|
|
\end{frame}
|
|
|
|
% TODO:
|
|
|
|
% Show different cases on OSF:
|
|
% 1. OSF with handmade codebook, all in one folder
|
|
% 2. OSF with different components (show that they can all have different
|
|
% licenses)
|
|
% 3. OSF with Github integrated
|
|
|
|
% Show selection of servers (GDPR)
|
|
|
|
\begin{frame}{Licenses}
|
|
\begin{columns}
|
|
\begin{column}{.3\textwidth}
|
|
\includegraphics[scale = .4]{../figures/licenses_osf.png}
|
|
\end{column}
|
|
\begin{column}{.7\textwidth}
|
|
\begin{itemize}
|
|
\item OSF offers you several options for licenses
|
|
\item For data the Creative Commons (CC) licenses are usually a good option
|
|
\item For software, other options might be better suited
|
|
\item For code (e.\,g., analysis scripts) CC licenses are also a good
|
|
choice
|
|
\end{itemize}
|
|
\vspace{1cm}
|
|
|
|
\hfill{\footnotesize \url{https://creativecommons.org/}}\\
|
|
\hfill{\footnotesize \url{https://help.osf.io/article/288-license-your-project}}\\
|
|
\hfill{\footnotesize \url{https://choosealicense.com/}}
|
|
\end{column}
|
|
\end{columns}
|
|
\end{frame}
|
|
|
|
\begin{frame}{GitHub}
|
|
{\url{https://github.com/}}
|
|
\begin{columns}
|
|
\begin{column}{.8\textwidth}
|
|
\begin{itemize}
|
|
\item Developer platform that allows developers to create, store, manage and
|
|
share code
|
|
\item Based on Git software providing version control
|
|
\begin{itemize}
|
|
\item[+] access control
|
|
\item[+] bug tracking
|
|
\item[+] software feature requests
|
|
\item[+] task management
|
|
\item[+] continuous integration
|
|
\item[+] wikis
|
|
\end{itemize}
|
|
\item Commonly used to host open source software development projects
|
|
\item Bought by Microsoft in 2018
|
|
\end{itemize}
|
|
\end{column}
|
|
\begin{column}{.3\textwidth}
|
|
\includegraphics[scale = .2]{../figures/github.png}
|
|
\end{column}
|
|
\end{columns}
|
|
\end{frame}
|
|
|
|
\begin{frame}{GitHub workflow}
|
|
\begin{center}
|
|
\includegraphics[scale = .3]{../figures/workflow_git-github.png}
|
|
\end{center}
|
|
\hfill{\tiny \url{https://carpentries-incubator.github.io/open-science-with-r/09-collaborating}}
|
|
\end{frame}
|
|
|
|
% TODO:
|
|
|
|
% READMEs:
|
|
% https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes
|
|
|
|
\section[Repositories]{Sharing data in repositories}
|
|
|
|
\begin{frame}[allowframebreaks]{Data publication}
|
|
Which data should I share?
|
|
\begin{itemize}
|
|
\item In general, all data that are used in publications
|
|
\item Data for your dissertation
|
|
\item Data that you collected but know that you will never get around to
|
|
analyzing
|
|
\end{itemize}
|
|
\vspace{.3cm}
|
|
What are the reasons to share data?
|
|
\begin{itemize}
|
|
\item Transparency
|
|
\item Data safety
|
|
\item Cumulative research process
|
|
\item Visibility
|
|
\end{itemize}
|
|
\framebreak
|
|
For whom are you sharing data?
|
|
\begin{itemize}
|
|
\item Yourself
|
|
\item Reviewers
|
|
\item People who read your papers
|
|
\item Other scientists
|
|
\item Colleagues and collaborators
|
|
\end{itemize}
|
|
\vspace{.3cm}
|
|
How should you share your data?
|
|
\begin{itemize}
|
|
\item On a public platform (or website), i.\,e., no account needed if
|
|
possible
|
|
\item Together with a codebook or at least an informative README
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}{Data repositories (suggested in our Research Data Policy)}
|
|
National
|
|
\begin{itemize}
|
|
\item \url{https://www.psycharchives.org/}
|
|
\item \url{https://www.forschungsdaten-bildung.de/}
|
|
\item \url{https://datorium.gesis.org/}
|
|
\item \url{https://www.iqb.hu-berlin.de/fdz}
|
|
\end{itemize}
|
|
\vspace{.4cm}
|
|
International
|
|
\begin{itemize}
|
|
\item \url{https://datadryad.org/}
|
|
\item \url{https://osf.io/}
|
|
\item \url{https://zenodo.org/}
|
|
\end{itemize}
|
|
\vfill
|
|
|
|
\hfill{\footnotesize \url{https://datamanagement.hms.harvard.edu/share-publish/data-repositories}}
|
|
\end{frame}
|
|
|
|
\begin{frame}[<+->]{Zenodo}{https://zenodo.org/}
|
|
\begin{itemize}
|
|
\item General-purpose open repository launched in 2013
|
|
\item Financed by the EU (European OpenAIRE program)
|
|
\item Operated by CERN
|
|
\item All disciplines
|
|
\item Suitable for
|
|
\begin{itemize}
|
|
\item Data sets
|
|
\item Papers / Preprints
|
|
\item Research software
|
|
\item Reports
|
|
\item Any other digital research objects
|
|
\end{itemize}
|
|
\item Upload up to 50 GB possible
|
|
\item Easily citable since all objects get DOI
|
|
\item Open source code is available on GitHub
|
|
\item IWM example: \url{https://doi.org/10.5281/zenodo.2532411}
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}[<+->]{PsychArchives}{https://psycharchives.org/}
|
|
\begin{itemize}
|
|
\item Disciplinary repository for psychological science (and neighboring
|
|
disciplines)
|
|
\item Developed and operated by ZPID (Leibniz-Institut für Psychologie)
|
|
\item Accommodating 20 different digital research object (DRO) types
|
|
\begin{itemize}
|
|
\item Articles
|
|
\item Preprints
|
|
\item Research data
|
|
\item Code
|
|
\item Supplements
|
|
\item Preregistrations
|
|
\item \dots
|
|
\end{itemize}
|
|
\item Searchable by ``IWM'': \url{https://psycharchives.org/en/browse/?q=iwm}
|
|
\item Easily citable since all objects get DOI
|
|
\item Different objects can be linked together (e.\,g., data and code)
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\appendix
|
|
%%\begin{frame}[allowframebreaks]{References}
|
|
\begin{frame}{References}
|
|
%\renewcommand{\bibfont}{\small}
|
|
\printbibliography
|
|
\vfill
|
|
\end{frame}
|
|
|
|
\end{document}
|
|
|