data_management/04_data_sharing/04_data_sharing.tex

516 lines
17 KiB
TeX

\documentclass[aspectratio=169]{beamer}
\usepackage{listings}
%\usepackage[utf8]{inputenc}
\usepackage[style = apa, backend = biber, natbib = true]{biblatex}
\addbibresource{../literature/lit.bib}
\usepackage{fancyvrb}
\usepackage{fontawesome5} % get icons
\usepackage{multirow}
\usepackage{color, colortbl}
\usepackage{tikz}
\usetikzlibrary{fit}
\usepackage[edges]{forest}
% Default look of code listings (language: R). The iwm* colours referenced
% here are the ones declared with \definecolor elsewhere in this preamble.
\lstset{%
  language=R,%
  % --- colours and fonts ---
  backgroundcolor=\color{iwmgray!15!white},
  basicstyle=\ttfamily\color{iwmgray},
  commentstyle=\slshape\color{iwmgreen},
  keywordstyle=\bfseries\color{iwmgray},
  identifierstyle=\color{iwmpurple},
  stringstyle=\color{iwmblue},
  emphstyle=\color{red!50!white},
  % --- layout ---
  frame=none,
  numbers=none,%left,numberstyle=\tiny,
  basewidth={.5em, .4em},
  showstringspaces=false%
}
% --- IWM colour palette ------------------------------------------------------
% Declared first so that every template/colour setting below refers to a
% colour name that already exists.
\definecolor{iwmorange}{RGB}{255,105,0}
\definecolor{iwmgray}{RGB}{67,79,79}
\definecolor{iwmblue}{RGB}{60,180,220}
\definecolor{iwmgreen}{RGB}{145,200,110}
\definecolor{iwmpurple}{RGB}{120,0,75}

% --- Bibliography glue -------------------------------------------------------
% Make \newblock expand to beamer's internal \beamer@newblock; the @ in the
% name requires \makeatletter ... \makeatother around the definition.
\makeatletter
\def\newblock{\beamer@newblock}
\makeatother

% --- Templates ---------------------------------------------------------------
\beamertemplatenavigationsymbolsempty
\setbeamertemplate{itemize items}[circle]
\setbeamertemplate{section in toc}[circle]
\setbeamertemplate{bibliography item}{}

% --- Colours of beamer elements ----------------------------------------------
% Orange for titles and structure, gray for running text and meta data.
\setbeamercolor{title}{fg=iwmorange}
\setbeamercolor{frametitle}{fg=iwmorange}
\setbeamercolor{structure}{fg=iwmorange}
\setbeamercolor{normal text}{fg=iwmgray}
\setbeamercolor{author}{fg=iwmgray}
\setbeamercolor{date}{fg=iwmgray}
\mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
% Blocks: orange title bar on a lighter orange body
\setbeamercolor{block title}{fg=white, bg=iwmorange}
\setbeamercolor{block body}{bg=iwmorange!50!white}
% Definitions for biblatex
\setbeamercolor{bibliography entry note}{fg=iwmgray}
\setbeamercolor{bibliography entry author}{fg=iwmgray}
% Semantic math shorthands for vectors and matrices.
%   \vect, \mat   : argument set with \mathbf
%   \gvect, \gmat : argument set with \boldsymbol
%                   (presumably the variants for Greek symbols -- confirm)
% Starred \newcommand* is used because the arguments are short math
% fragments; a stray paragraph break is then reported at the call site
% instead of as a runaway argument much later.
\newcommand*{\vect}[1]{\mathbf{#1}}
\newcommand*{\mat}[1]{\mathbf{#1}}
\newcommand*{\gvect}[1]{\boldsymbol{#1}}
\newcommand*{\gmat}[1]{\boldsymbol{#1}}
% At the start of every section, insert an outline frame in which only the
% section being entered is shown (sectionstyle=show/hide).
\AtBeginSection[]{
  \begin{frame}
    \tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]
  \end{frame}}
% Headline: horizontal section navigation spanning the paper width.
\setbeamertemplate{headline}{
  \begin{beamercolorbox}{section in head}
    \vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
  \end{beamercolorbox}
}
% Footline: frame number flushed right, followed by a thick math space.
\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
\title{Data sharing}
\author{Nora Wickelmaier}
\date{June 24, 2024}
\begin{document}
\begin{frame}{}
\thispagestyle{empty}
\titlepage
\end{frame}
\begin{frame}{What are the benefits of sharing your data?}
% slido
\centering
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
\end{frame}
\begin{frame}[<+->]{Benefits of sharing data}
Sharing data
\begin{itemize}
\item[\dots] ensures that data are not ultimately lost (save data for posterity)
\item[\dots] is consistent with scientific norms of openness and rigor
\item[\dots] increases citation scores of papers
\item[\dots] encourages more research because it enables secondary analyses
\item[\dots] facilitates subsequent reanalyses (correct errors, emphasize
robustness of original results)
\item[\dots] is demanded by most third party funding agencies
\end{itemize}
\vfill
\hfill\tiny \citet{Wicherts2012}
\end{frame}
% Course agenda. Slide 1 shows the plain table; on slide 2 the row of the
% current session (2024-06-24) is set in bold. beamer's overlay-aware
% \textbf<2>{...} replaces the obsolete \bf switch and avoids repeating the
% cell text in separate \only<1>/\only<2> branches.
\begin{frame}{Agenda}
\centering
\begin{tabular}{ll}
\hline
Date & Topic \\
\hline
2024-05-13 & Introduction to data management \\
2024-05-27 & Workflow \\
2024-06-10 & Data organisation\\
\textbf<2>{2024-06-24} & \textbf<2>{Data sharing} \\
2024-07-08 & Clean coding \\
2024-07-22 & Version control \\
\hline
\end{tabular}
\end{frame}
% uploading under a license (CC-BY....)
% loading data on an archive, repository etc...
% Doing the archive
% Important things before the open-access data
% Where to store data for long-term accessibility (conventions?)
% Tools, where I should upload my final data
% Upload data before or after publishing a paper? Time management
\section{Data organisation}
\begin{frame}[<+->]{What we covered so far}
\begin{itemize}
\item What habits do we need for effective research data management?
\item What is a workflow and why do we need one?
\item What needs to be considered when naming files of a research project?
\item How to organize folders for a research project?
\item What metadata should be added to my research project?
\item What are good ways to document a data set?
\end{itemize}
\end{frame}
\begin{frame}{Examples for documenting data sets}
\begin{enumerate}
\item A recent paper with published data by \citet{Ngo2023} investigating
what cues are considered by Twitter users to identify social bots
\item A multi-cohort, longitudinal study by the Hector Research Institute of
Education Sciences and Psychology at the University of Tübingen:
Transformation of the secondary school system and academic careers
\citep[TOSCA,][]{Koeller2004}
\item Editorial on why to publish your data with an accompanying data set
by \citet{Wicherts2012}
\end{enumerate}
\vfill
\end{frame}
\begin{frame}{\citet{Ngo2023}}
They provide
\begin{itemize}
\item A data set with 221 observations and 633 variables
\item A PDF with all measures and the scenario used for collecting the data
\end{itemize}
\vspace{.3cm}
\begin{block}{Exercise}
\begin{itemize}
\item Go to \url{https://osf.io/6y3nk/} and download the files
\texttt{data.csv} and
\texttt{Experimental-Study-Measures and scenario.pdf}
\item Read the data into R using \texttt{read.csv()}
\item Find out which variables in the data correspond to measure
``(9)~Demographics''
\end{itemize}
\end{block}
\vspace{.3cm}
\pause
(BTW: Sharing the data in this form is better than \emph{not} sharing them,
in my opinion)
\end{frame}
\begin{frame}{What additional information do we need to use these data?}
% slido
\centering
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
\end{frame}
\begin{frame}{TOSCA}
\begin{itemize}
\item Multi-cohort study that includes longitudinal data for several cohorts
\item Broad spectrum of achievement test data and psycho-social variables
\item Large number of publications on different topics using these data
\item This is not the original data set, but a prepared version for teaching
statistics (hence, proportions in the data and the codebook are not
identical)
\end{itemize}
\begin{block}{Exercise}
\begin{itemize}
\item Read the data set \texttt{TOSCAtoTeach\_W123.sav} into R using
\texttt{foreign::read.spss()} or \texttt{haven::read\_spss()}
\item Create contingency tables for the variables \texttt{sform} and
\texttt{szweig1} and compare the results to the codebook
\texttt{Skalenhandbuch\_TOSCAtoTeachW123.pdf}
\end{itemize}
\end{block}
\hfill{\tiny \url{https://uni-tuebingen.de/en/faculties/faculty-of-economics-and-social-sciences/subjects/department-of-social-sciences/education-sciences-and-psychology/research/current-studies/tosca}}
\end{frame}
\begin{frame}{\citet{Wicherts2012}}
They provide
\begin{itemize}
\item A data set with 537 observations and 79 variables
(\texttt{1-s2.0-S0160289612000050-mmc2.xls})
\item A codebook with variable names and some descriptive statistics for
the scales (\texttt{1-s2.0-S0160289612000050-mmc1.doc})
\item ``Publish (your data) or (let the data) perish! Why not publish your
data too?''
\item Data come from freshman-testing program called ``Testweek''
\item (Try \texttt{readxl::read\_excel()} to read the data into R)
\end{itemize}
\vfill
\end{frame}
\begin{frame}{What is the single thing that would make sharing these data
infinitely better?}
% slido
\centering
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
\end{frame}
\begin{frame}[<+->]{Non-anonymous data}
\begin{itemize}
\item Before putting data into any cloud, you should always take a moment to
reflect if your data are anonymous
\item No (third-party) cloud storage, even if it is not publicly accessible
\item If your data contain personal data, they should always be stored
locally, ideally on an encrypted device
\item You should have a plan --- before ever collecting the data --- how,
when, and by whom the data will be anonymized
\item All data should eventually be anonymized! (Yes, even audio and video
data)
\item IWM servers can be considered local
\end{itemize}
\end{frame}
\section[Collaborative use]{Sharing data for collaborative use}
\begin{frame}[<+->]{Working together with the same data}
\begin{itemize}
\item Part of data organisation is to think about who needs access to
your data
\item Often these are colleagues from the same lab and there is
infrastructure to share files and scripts easily
\item The IWM offers several solutions for sharing your data (internally and
externally)
\item When the end goal is to make the data public, it might be a good idea
to work together at a place where the data can go public at a certain
point in time
\item We will look at two possibilities: OSF and Github
\end{itemize}
\vfill
\end{frame}
\begin{frame}{IWM solutions}
IWM servers
\begin{itemize}
\item Nextcloud: \url{https://nextcloud.iwm-tuebingen.de/}
\item Gitea: \url{https://gitea.iwm-tuebingen.de/}
\item Shared drive: \texttt{Y:/}
\end{itemize}
\vspace{.4cm}
Microsoft servers
\begin{itemize}
\item OneDrive
\item Teams
\end{itemize}
\vfill
\pause
(Maybe check out the three tips of the week on this topic:
{\tiny
\url{https://iwmonline.sharepoint.com/sites/intranet/SitePages/direktorat/en/Interne-Kommunikation.aspx\#tip-of-the-week-tutorial-series}})
\end{frame}
\begin{frame}{Open Science Framework}
{\url{https://osf.io/}}
\begin{columns}
\begin{column}{.4\textwidth}
\begin{itemize}
\item ``OSF is a free and open source project management tool that supports
researchers throughout their entire project lifecycle.''
\item Founded in 2012 and constantly developed: \url{https://www.cos.io/timeline}
\item Meant to integrate all research steps
\end{itemize}
\end{column}
\begin{column}{.7\textwidth}
\includegraphics[scale = .2]{../figures/osf_workflow.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[fragile]{Let's try it out}
\begin{tikzpicture}[
every node/.style = {text width = 5.1cm, align = left},
every path/.style = {thick, draw}
]
\node (ex) at (0, 0) {\faIcon{folder} \verb+toyexample+};
\node (n1) at (5, 0) {\faIcon{folder} \verb+code+};
\node (n2) at (5, -1.4) {\faIcon{folder} \verb+data+};
\node (n3) at (5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
\path (ex.center) -- (n1.west);
\path (ex.center) -- (n2.west);
\path (ex.center) -- (n3.west);
\node (o1a) at (10, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
\node (o1b) at (10, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+};
\node (o2) at (10, -1.4) {\faIcon{folder} \verb+processed+};
\node (o3) at (10, -2.1) {\faIcon{folder} \verb+rawdata+};
\node (o4) at (10, -2.8) {\faIcon[regular]{file} \verb+codebook.pdf+};
\path (n1.center) -- (o1a.west);
\path (n1.center) -- (o1b.west);
\path (n2.center) -- (o2.west);
\path (n2.center) -- (o3.west);
\path (n2.center) -- (o4.west);
\end{tikzpicture}
Steps
\begin{enumerate}
\item You need an OSF account -- just sign up with an e-mail address or use ORCID
\item Sign in
\item Create a project
\item Upload (or link) your files
\item Invite contributors
\end{enumerate}
\end{frame}
% TODO:
% Show different cases on OSF:
% 1. OSF with handmade codebook, all in one folder
% 2. OSF with different components (show that they can all have different
% licenses)
% 3. OSF with Github integrated
% Show selection of servers (GDPR)
\begin{frame}{Licenses}
\begin{columns}
\begin{column}{.3\textwidth}
\includegraphics[scale = .4]{../figures/licenses_osf.png}
\end{column}
\begin{column}{.7\textwidth}
\begin{itemize}
\item OSF offers you several options for licenses
\item For data the Creative Commons (CC) licenses are usually a good option
\item For software, other options might be better suited
\item For code (e.\,g., analysis scripts) CC licenses are also a good
choice
\end{itemize}
\vspace{1cm}
\hfill{\footnotesize \url{https://creativecommons.org/}}\\
\hfill{\footnotesize \url{https://help.osf.io/article/288-license-your-project}}\\
\hfill{\footnotesize \url{https://choosealicense.com/}}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Github}
{\url{https://github.com/}}
\begin{columns}
\begin{column}{.8\textwidth}
\begin{itemize}
\item Developer platform that allows developers to create, store, manage and
share code
\item Based on Git software providing version control
\begin{itemize}
\item[+] access control
\item[+] bug tracking
\item[+] software feature requests
\item[+] task management
\item[+] continuous integration
\item[+] wikis
\end{itemize}
\item Commonly used to host open source software development projects
\item Bought by Microsoft in 2018
\end{itemize}
\end{column}
\begin{column}{.3\textwidth}
\includegraphics[scale = .2]{../figures/github.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Github workflow}
\begin{center}
\includegraphics[scale = .3]{../figures/workflow_git-github.png}
\end{center}
\hfill{\tiny \url{https://carpentries-incubator.github.io/open-science-with-r/09-collaborating}}
\end{frame}
% TODO:
% READMEs:
% https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes
\section[Repositories]{Sharing data in repositories}
\begin{frame}[allowframebreaks]{Data publication}
Which data should I share?
\begin{itemize}
\item In general, all data that are used in publications
\item Data for your dissertation
\item Data that you collected but know that you will never get around to
analyzing
\end{itemize}
\vspace{.3cm}
What are the reasons to share data?
\begin{itemize}
\item Transparency
\item Data safety
\item Cumulative research process
\item Visibility
\end{itemize}
\framebreak
For whom are you sharing data?
\begin{itemize}
\item Yourself
\item Reviewers
\item People who read your papers
\item Other scientists
\item Colleagues and collaborators
\end{itemize}
\vspace{.3cm}
How should you share your data?
\begin{itemize}
\item On a public platform (or website), i.\,e., no account needed if
possible
\item Together with a codebook or at least an informative README
\end{itemize}
\end{frame}
\begin{frame}{Data repositories (suggested in our Research Data Policy)}
National
\begin{itemize}
\item \url{https://www.psycharchives.org/}
\item \url{https://www.forschungsdaten-bildung.de/}
\item \url{https://datorium.gesis.org/}
\item \url{https://www.iqb.hu-berlin.de/fdz}
\end{itemize}
\vspace{.4cm}
International
\begin{itemize}
\item \url{https://datadryad.org/}
\item \url{https://osf.io/}
\item \url{https://zenodo.org/}
\end{itemize}
\vfill
\hfill{\footnotesize \url{https://datamanagement.hms.harvard.edu/share-publish/data-repositories}}
\end{frame}
\begin{frame}[<+->]{Zenodo}{\url{https://zenodo.org/}}
\begin{itemize}
\item General-purpose open repository launched in 2015
\item Financed by the EU (European OpenAIRE program)
\item Operated by CERN
\item All disciplines
\item Suitable for
\begin{itemize}
\item Data sets
\item Papers / Preprints
\item Research software
\item Reports
\item Any other digital research objects
\end{itemize}
\item Upload up to 50 GB possible
\item Easily citable since all objects get a DOI
\item Open source code is available on Github
\item IWM example: \url{https://doi.org/10.5281/zenodo.2532411}
\end{itemize}
\end{frame}
\begin{frame}[<+->]{PsychArchives}{\url{https://psycharchives.org/}}
\begin{itemize}
\item Disciplinary repository for psychological science (and neighboring
disciplines)
\item Developed and operated by ZPID (Leibniz-Institut für Psychologie)
\item Accommodating 20 different digital research object (DRO) types
\begin{itemize}
\item Articles
\item Preprints
\item Research data
\item Code
\item Supplements
\item Preregistrations
\item \dots
\end{itemize}
\item Searchable by ``IWM'': \url{https://psycharchives.org/en/browse/?q=iwm}
\item Easily citable since all objects get a DOI
\item Different objects can be linked together (e.\,g., data and code)
\end{itemize}
\end{frame}
\appendix
%%\begin{frame}[allowframebreaks]{References}
\begin{frame}{References}
%\renewcommand{\bibfont}{\small}
\printbibliography
\vfill
\end{frame}
\end{document}