Slides for fourth session

2024-06-23 14:43:31 +02:00 · 2024-06-23 14:43:31 +02:00 · 5cdb5fdb8f
commit 5cdb5fdb8f
parent e723230ca5
9 changed files with 496 additions and 1 deletions
--- a/03_data_organisation/example/README.md
+++ b/03_data_organisation/example/README.md
@ -28,7 +28,7 @@ numbered, indicating the order they should be executed in.
 The `data` folder contains all folders associated with data and its
 documentation.

-The `code` folder with contains different codebook options and R scripts that
+The `codebook` folder contains different codebook options and R scripts that
 create these codebooks. If the codebook is created by an R script, the script
 and the codebook are named identically, e.g., `codebook_01.R` and
 `codebook_01.xslx`.
--- a/04_data_sharing/04_data_sharing.tex
+++ b/04_data_sharing/04_data_sharing.tex
@ -0,0 +1,460 @@
+\documentclass[aspectratio=169]{beamer}
+
+\usepackage{listings}
+%\usepackage[utf8]{inputenc}
+\usepackage[style = apa, backend = biber, natbib = true]{biblatex}
+\addbibresource{../literature/lit.bib}
+
+\usepackage{fancyvrb}
+\usepackage{fontawesome5}                % get icons
+\usepackage{multirow}
+\usepackage{color, colortbl}
+
+\usepackage{tikz}
+\usetikzlibrary{fit}
+\usepackage[edges]{forest}
+
+\lstset{language=R,%
+  backgroundcolor=\color{iwmgray!15!white},
+  basicstyle=\ttfamily\color{iwmgray},
+  frame=none,
+  commentstyle=\slshape\color{iwmgreen},
+  keywordstyle=\bfseries\color{iwmgray},
+  identifierstyle=\color{iwmpurple},
+  stringstyle=\color{iwmblue},
+  numbers=none,%left,numberstyle=\tiny,
+  basewidth={.5em, .4em},
+  showstringspaces=false,
+  emphstyle=\color{red!50!white}}
+
+\makeatletter \def\newblock{\beamer@newblock} \makeatother
+
+\beamertemplatenavigationsymbolsempty
+\setbeamertemplate{itemize items}[circle]
+\setbeamertemplate{section in toc}[circle]
+\mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
+\setbeamercolor{block body}{bg=iwmorange!50!white}
+\setbeamercolor{block title}{fg=white, bg=iwmorange}
+% Definitions for biblatex
+\setbeamercolor{bibliography entry note}{fg=iwmgray}
+\setbeamercolor{bibliography entry author}{fg=iwmgray}
+\setbeamertemplate{bibliography item}{}
+
+\definecolor{iwmorange}{RGB}{255,105,0}
+\definecolor{iwmgray}{RGB}{67,79,79}
+\definecolor{iwmblue}{RGB}{60,180,220}
+\definecolor{iwmgreen}{RGB}{145,200,110}
+\definecolor{iwmpurple}{RGB}{120,0,75}
+
+\setbeamercolor{title}{fg=iwmorange}
+\setbeamercolor{frametitle}{fg=iwmorange}
+\setbeamercolor{structure}{fg=iwmorange}
+\setbeamercolor{normal text}{fg=iwmgray}
+\setbeamercolor{author}{fg=iwmgray}
+\setbeamercolor{date}{fg=iwmgray}
+
+\newcommand{\vect}[1]{\mathbf{#1}}
+\newcommand{\mat}[1]{\mathbf{#1}}
+\newcommand{\gvect}[1]{\boldsymbol{#1}}
+\newcommand{\gmat}[1]{\boldsymbol{#1}}
+
+\AtBeginSection[]{
+  \frame{
+    \tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}}
+
+\setbeamertemplate{headline}{
+ \begin{beamercolorbox}{section in head}
+   \vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
+ \end{beamercolorbox}
+}
+
+\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
+
+\title{Data sharing}
+\author{Nora Wickelmaier}
+\date{June 24, 2024}
+
+\begin{document}
+
+\begin{frame}{}
+\thispagestyle{empty}
+\titlepage
+\end{frame}
+
+\begin{frame}{What are the benefits of sharing your data?}
+  % slido
+  \centering
+  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
+
+  \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
+\end{frame}
+
+\begin{frame}[<+->]{Benefits of sharing data}
+  Sharing data
+  \begin{itemize}
+    \item[\dots] ensures that data are not ultimately lost (save data for posterity)
+    \item[\dots] is consistent with scientific norms of openness and rigor
+    \item[\dots] increases citation scores of papers
+    \item[\dots] encourages more research because it enables secondary analyses
+    \item[\dots] facilitates subsequent reanalyses (correct errors, emphasize
+      robustness of original results)
+    \item[\dots] is demanded by most third party funding agencies
+  \end{itemize}
+  \vfill
+  \hfill\tiny \citet{Wicherts2012}
+\end{frame}
+
+\begin{frame}{Agenda}
+\centering
+\begin{tabular}{ll}
+\hline
+Date & Topic \\
+\hline
+2024-05-13 & Introduction to data management \\
+2024-05-27 & Workflow \\
+2024-06-10 & Data organisation\\
+\only<1>{2024-06-24}\only<2>{\bf 2024-06-24} & \only<1>{Data sharing}\only<2>{\bf Data sharing}                    \\
+2024-07-08 & Clean coding                    \\
+2024-07-22 & Version control                 \\
+\hline
+\end{tabular}
+\end{frame}
+
+% uploading under a license (CC-BY....) 
+% loading data on an archive, repository etc... 
+% Doing the archive
+% Important things before the open-access data
+% Where to store data for long-term accessibility (conventions?)
+% Tools, where I should upload my final data
+% Upload data before or after publishing a paper? Time management
+
+\section{Data organisation}
+
+\begin{frame}[<+->]{What we covered so far}
+  \begin{itemize}
+    \item What habits do we need for effective research data management?
+    \item What is a workflow and why do we need one?
+    \item What needs to be considered when naming files of a research project?
+    \item How to organize folders for a research project?
+    \item What metadata should be added to my research project?
+    \item What are good ways to document a data set?
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Examples for documenting data sets}
+  \begin{enumerate}
+    \item A recent paper with published data by \citet{Ngo2023} investigating
+      what cues are considered by Twitter users to identify social bots
+    \item A multi-cohort, longitudinal study by the Hector Research Institute of
+      Education Sciences and Psychology at the University of Tübingen:
+      Transformation of the secondary school system and academic careers
+      \citep[TOSCA,][]{Koeller2004}
+    \item Editorial on why to publish your data with an accompanying data set
+      by \citet{Wicherts2012}
+  \end{enumerate}
+  \vfill
+\end{frame}
+
+\begin{frame}{\citet{Ngo2023}}
+  They provide
+  \begin{itemize}
+    \item A data set with 221 observations and 633 variables
+    \item A PDF with all measures and the scenario used for collecting the data
+  \end{itemize}
+  \vspace{.3cm}
+    \begin{block}{Exercise}
+    \begin{itemize}
+      \item Go to \url{https://osf.io/6y3nk/} and download the files
+        \texttt{data.csv} and
+        \texttt{Experimental-Study-Measures and scenario.pdf}
+      \item Read the data into R using \texttt{read.csv()}
+      \item Find out which variables in the data correspond to measure
+    ``(9)~Demographics''
+    \end{itemize}
+  \end{block}
+  \vspace{.3cm}
+  \pause
+  (BTW: Sharing the data in this form is better than \emph{not} sharing them,
+  in my opinion)
+\end{frame}
+
+\begin{frame}{What additional information do we need to use these data?}
+  % slido
+  \centering
+  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
+
+  \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
+\end{frame}
+
+\begin{frame}{TOSCA}
+  \begin{itemize}
+    \item Multi-cohort study that includes longitudinal data for several cohorts
+    \item Broad spectrum of achievement test data and psycho-social variables
+    \item Large number of publications on different topics using these data
+    \item This is not the original data set, but a prepared version for teaching
+      statistics (hence, proportions in the data and the codebook are not
+      identical)
+  \end{itemize}
+    \begin{block}{Exercise}
+    \begin{itemize}
+      \item Read the data set \texttt{TOSCAtoTeach\_W123.sav} into R using
+        \texttt{foreign::read.spss()} or \texttt{haven::read\_spss()}
+      \item Create contingency tables for the variables \texttt{sform} and
+        \texttt{szweig1} and compare the results to the codebook
+        \texttt{Skalenhandbuch\_TOSCAtoTeachW123.pdf}
+    \end{itemize}
+  \end{block}
+  \hfill{\tiny \url{https://uni-tuebingen.de/en/faculties/faculty-of-economics-and-social-sciences/subjects/department-of-social-sciences/education-sciences-and-psychology/research/current-studies/tosca}}
+\end{frame}
+
+\begin{frame}{\citet{Wicherts2012}}
+  They provide
+  \begin{itemize}
+    \item A data set with 537 observations and 79 variables
+      (\texttt{1-s2.0-S0160289612000050-mmc2.xls})
+    \item A codebook with variable names and some descriptive statistics for
+      the scales (\texttt{1-s2.0-S0160289612000050-mmc1.doc})
+    \item ``Publish (your data) or (let the data) perish! Why not publish your
+      data too?''
+    \item Data come from a freshman-testing program called ``Testweek''
+    \item (Try \texttt{readxl::read\_excel()} to read the data into R)
+  \end{itemize}
+  \vfill
+\end{frame}
+
+\begin{frame}{What is the one thing that would make sharing these data
+  infinitely better?}
+  % slido
+  \centering
+  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
+
+  \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
+\end{frame}
+
+\begin{frame}[<+->]{Non-anonymous data}
+  \begin{itemize}
+    \item Before putting data into any cloud, you should always take a moment to
+      reflect if your data are anonymous
+    \item No (third-party) cloud storage, even if it is not publicly accessible
+    \item If your data contains personal data, it should always be stored
+      locally, ideally on an encrypted device
+    \item You should have a plan --- before ever collecting the data --- how,
+      when, and by whom the data will be anonymized
+    \item All data should eventually be anonymized! (Yes, even audio and video
+      data)
+    \item IWM servers can be considered local
+  \end{itemize}
+\end{frame}
+
+\section[Collaborative use]{Sharing data for collaborative use}
+
+\begin{frame}[<+->]{Working together with the same data}
+  \begin{itemize}
+    \item Part of data organisation is to think about who needs access to
+      your data
+    \item Often these are colleagues from the same lab and there is
+      infrastructure to share files and scripts easily
+    \item The IWM offers several solutions for sharing your data (internally and
+      externally)
+    \item When the end goal is to make the data public, it might be a good idea
+      to work together at a place where the data can go public at a certain
+      point in time
+    \item We will look at two possibilities: OSF and Github
+  \end{itemize}
+  \vfill
+\end{frame}
+
+\begin{frame}{IWM solutions}
+  IWM servers
+  \begin{itemize}
+    \item Nextcloud: \url{https://nextcloud.iwm-tuebingen.de/}
+    \item Gitea: \url{https://gitea.iwm-tuebingen.de/}
+    \item Shared drive: \texttt{Y:/}
+  \end{itemize}
+  \vspace{.4cm}
+  Microsoft servers
+  \begin{itemize}
+    \item OneDrive
+    \item Teams
+  \end{itemize}
+  \vfill
+  \pause
+  (Maybe check out the two tips of the week on this topic:
+  {\tiny
+  \url{https://iwmonline.sharepoint.com/sites/intranet/SitePages/direktorat/en/Interne-Kommunikation.aspx\#tip-of-the-week-tutorial-series}})
+\end{frame}
+
+\begin{frame}{Open Science Framework}
+  {\url{https://osf.io/}}
+  \begin{columns}
+  \begin{column}{.4\textwidth}
+  \begin{itemize}
+    \item ``OSF is a free and open source project management tool that supports
+      researchers throughout their entire project lifecycle.''
+    \item Founded in 2012 and constantly developed: \url{https://www.cos.io/timeline}
+    \item Meant to integrate all research steps
+  \end{itemize}
+  \end{column}
+  \begin{column}{.7\textwidth}
+    \includegraphics[scale = .2]{../figures/osf_workflow.png}
+  \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}[fragile]{Let's try it out}
+  \begin{tikzpicture}[
+    every node/.style = {text width = 5.1cm, align = left},
+    every path/.style = {thick, draw}
+  ]
+    \node (ex) at  (0, 0) {\faIcon{folder} \verb+toyexample+};
+    \node (n1) at  (5, 0) {\faIcon{folder} \verb+code+};
+    \node (n2) at  (5, -1.4) {\faIcon{folder} \verb+data+};
+    \node (n3) at  (5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
+    \path (ex.center) -- (n1.west);
+    \path (ex.center) -- (n2.west);
+    \path (ex.center) -- (n3.west);
+
+    \node (o1a) at (10, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
+    \node (o1b) at (10, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+};
+    \node (o2)  at (10, -1.4) {\faIcon{folder} \verb+processed+};
+    \node (o3)  at (10, -2.1) {\faIcon{folder} \verb+rawdata+};
+    \node (o4)  at (10, -2.8) {\faIcon[regular]{file} \verb+codebook.pdf+};
+    \path (n1.center) -- (o1a.west);
+    \path (n1.center) -- (o1b.west);
+    \path (n2.center) -- (o2.west);
+    \path (n2.center) -- (o3.west);
+    \path (n2.center) -- (o4.west);
+  \end{tikzpicture}
+  Steps
+  \begin{enumerate}
+    \item You need an OSF account -- just sign up with an e-mail address or use ORCID
+    \item Sign in
+    \item Create a project
+    \item Upload (or link) your files
+    \item Invite contributors
+  \end{enumerate}
+\end{frame}
+
+% TODO:
+
+% Show different cases on OSF:
+% 1. OSF with handmade codebook, all in one folder
+% 2. OSF with different components (show that they can all have different
+% licenses)
+% 3. OSF with Github integrated
+
+% Show selection of servers (GDPR)
+
+\begin{frame}{Licenses}
+  \begin{columns}
+    \begin{column}{.3\textwidth}
+      \includegraphics[scale = .4]{../figures/licenses_osf.png}
+    \end{column}
+    \begin{column}{.7\textwidth}
+      \begin{itemize}
+        \item OSF offers you several options for licenses
+        \item For data the Creative Commons (CC) licenses are usually a good option
+        \item For software, other options might be better suited
+        \item For code (e.\,g., analysis scripts) CC licenses are also a good
+          choice
+      \end{itemize}
+      \vspace{1cm}
+
+      \hfill{\footnotesize \url{https://creativecommons.org/}}\\
+      \hfill{\footnotesize \url{https://help.osf.io/article/288-license-your-project}}\\
+      \hfill{\footnotesize \url{https://choosealicense.com/}}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}{Github}
+  {\url{https://github.com/}}
+  \begin{columns}
+  \begin{column}{.8\textwidth}
+  \begin{itemize}
+    \item Developer platform that allows developers to create, store, manage and
+      share code
+    \item Based on Git software providing version control
+      \begin{itemize}
+        \item[+] access control
+        \item[+] bug tracking
+        \item[+] software feature requests
+        \item[+] task management
+        \item[+] continuous integration
+        \item[+] wikis
+      \end{itemize}
+    \item Commonly used to host open source software development projects
+    \item Bought by Microsoft in 2018 
+  \end{itemize}
+  \end{column}
+  \begin{column}{.3\textwidth}
+    \includegraphics[scale = .2]{../figures/github.png}
+  \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}{Github workflow}
+  \begin{center}
+    \includegraphics[scale = .3]{../figures/workflow_git-github.png}
+  \end{center}
+  \hfill{\tiny \url{https://carpentries-incubator.github.io/open-science-with-r/09-collaborating}}
+\end{frame}
+
+% TODO:
+
+% READMEs:
+% https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes
+
+\section[Repositories]{Sharing data in repositories}
+
+\begin{frame}{Data repositories}
+  National
+  \begin{itemize}
+    \item \url{https://www.psycharchives.org/}
+    \item \url{https://www.forschungsdaten-bildung.de/}
+    \item \url{https://datorium.gesis.org/}
+    \item \url{https://www.iqb.hu-berlin.de/fdz}
+  \end{itemize}
+  \vspace{.4cm}
+  International
+  \begin{itemize}
+    \item \url{https://datadryad.org/}
+    \item \url{https://osf.io/}
+    \item \url{https://zenodo.org/}
+  \end{itemize}
+  \vfill
+
+  \hfill{\footnotesize \url{https://datamanagement.hms.harvard.edu/share-publish/data-repositories}}
+\end{frame}
+
+\appendix
+%%\begin{frame}[allowframebreaks]{References}
+\begin{frame}{References}
+%\renewcommand{\bibfont}{\small}
+  \printbibliography
+\vfill
+\end{frame}
+
+\begin{frame}{A codebook should include}
+  \begin{tabular}{lp{11cm}}
+    \hline
+    Variable name & Usually some abbreviation like \texttt{pna01} \\
+    Variable label & Brief description to identify variable \\
+    Question text & If applicable, exact wording from survey question \\
+    Values & Values variable can take (e.\,g., 1 to 5) \\
+    Value labels & If applicable, textual descriptions of the values \\
+    Statistics & For example, range, mean, standard deviation for
+    numeric variables; frequencies and percentages for categorical variables \\
+    Missing data & If applicable, values and labels of missing data \\
+    Notes & Additional notes, remarks, or comments; for measures or
+    questions from copyrighted instruments, the notes field can be used to
+    cite the source \\
+    \hline
+  \end{tabular}
+  \vfill
+
+  \hfill\tiny \url{https://www.icpsr.umich.edu/web/ICPSR/cms/1983}
+\end{frame}
+
+\end{document}
+
--- a/README.md
+++ b/README.md
@ -20,6 +20,10 @@ Frazier, M. R., O'Hara, C. C., Jiang, N., & Halpern, B. S. (2017). Our path
 to better science in less time using open data science tools. _Nature
 Ecology & Evolution, 1_(6), 1-7. https://doi.org/10.1038/s41559-017-0160

+Wicherts, J. M., & Bakker, M. (2012). Publish (your data) or (let the data)
+perish! Why not publish your data too? _Intelligence, 40_(2), 73–76.
+https://doi.org/10.1016/j.intell.2012.01.004
+
 Wilbrandt, J. (2023). Research Data Management Intro Series: Coffee Lectures &
 Espresso Shots. https://doi.org/10.5281/zenodo.7573695

--- a/figures/QR
+++ b/figures/QR
--- a/figures/github.png
+++ b/figures/github.png
--- a/figures/licenses_osf.png
+++ b/figures/licenses_osf.png
--- a/figures/osf_workflow.png
+++ b/figures/osf_workflow.png
--- a/figures/workflow_git-github.png
+++ b/figures/workflow_git-github.png
--- a/literature/lit.bib
+++ b/literature/lit.bib
@ -8,6 +8,14 @@
  doi       = {10.1525/collabra.18684}
 }

+@book{Koeller2004,
+  title   = {Wege zur {H}ochschulreife in {B}aden-{W}{\"u}rttemberg: {TOSCA} -- {E}ine {U}ntersuchung an allgemein bildenden und beruflichen {G}ymnasien},
+  author  = {K{\"o}ller, Olaf and Watermann, Ralf and Trautwein, Ulrich and L{\"u}dtke, Oliver},
+  year    = {2004},
+  publisher = {Springer},
+  doi     = {10.1007/978-3-322-80906-3}
+}
+
@article{Lowndes2017,
  title     = {Our path to better science in less time using open data science tools},
  author    = {Lowndes, Julia S Stewart and Best, Benjamin D and Scarborough, Courtney and Afflerbach, Jamie C and Frazier, Melanie R and O'Hara, Casey C and Jiang, Ning and Halpern, Benjamin S},
@ -30,6 +38,29 @@
  doi       = {10.1177/2515245917747656}
 }

+@article{Ngo2023,
+  title     = {Spot the bot: Investigating user's detection cues for social bots and their willingness to verify Twitter profiles},
+  journal   = {Computers in Human Behavior},
+  volume    = {146},
+  pages     = {107819},
+  year      = {2023},
+  issn      = {0747-5632},
+  doi       = {10.1016/j.chb.2023.107819},
+  url       = {https://www.sciencedirect.com/science/article/pii/S074756322300170X},
+  author    = {Thao Ngo and Magdalena Wischnewski and Rebecca Bernemann and Martin Jansen and Nicole Kr{\"a}mer}
+}
+
+@article{Wicherts2012,
+  title   = {Publish (your data) or (let the data) perish! {W}hy not publish your data too?},
+  author  = {Wicherts, Jelte M and Bakker, Marjan},
+  journal = {Intelligence},
+  volume  = {40},
+  number  = {2},
+  pages   = {73--76},
+  year    = {2012},
+  doi     = {10.1016/j.intell.2012.01.004}
+}
+
@misc{Wilbrandt2023,
  author    = {Wilbrandt, Jeanne},
  title     = {{Research Data Management Intro Series: Coffee Lectures \& Espresso Shots}},