Slides for fourth session

2024-06-23 14:43:31 +02:00
parent e723230ca5
commit 5cdb5fdb8f
9 changed files with 496 additions and 1 deletions
@@ -28,7 +28,7 @@ numbered, indicating the order they should be executed in.
 The `data` folder contains all folders associated with data and its
 documentation.
-The `code` folder with contains different codebook options and R scripts that
+The `codebook` folder contains different codebook options and R scripts that
 create these codebooks. If the codebook is created by an R script, the script
 and the codebook are named identically, e.g., `codebook_01.R` and
 `codebook_01.xlsx`.
@@ -0,0 +1,460 @@
 \documentclass[aspectratio=169]{beamer}
 \usepackage{listings}
 %\usepackage[utf8]{inputenc}
 \usepackage[style = apa, backend = biber, natbib = true]{biblatex}
 \addbibresource{../literature/lit.bib}
 \usepackage{fancyvrb}
 \usepackage{fontawesome5}                % get icons
 \usepackage{multirow}
 \usepackage{color, colortbl}
 \usepackage{tikz}
 \usetikzlibrary{fit}
 \usepackage[edges]{forest}
 \lstset{language=R,%
  backgroundcolor=\color{iwmgray!15!white},
  basicstyle=\ttfamily\color{iwmgray},
  frame=none,
  commentstyle=\slshape\color{iwmgreen},
  keywordstyle=\bfseries\color{iwmgray},
  identifierstyle=\color{iwmpurple},
  stringstyle=\color{iwmblue},
  numbers=none,%left,numberstyle=\tiny,
  basewidth={.5em, .4em},
  showstringspaces=false,
  emphstyle=\color{red!50!white}}
 \makeatletter \def\newblock{\beamer@newblock} \makeatother
 \beamertemplatenavigationsymbolsempty
 \setbeamertemplate{itemize items}[circle]
 \setbeamertemplate{section in toc}[circle]
 \mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
 \setbeamercolor{block body}{bg=iwmorange!50!white}
 \setbeamercolor{block title}{fg=white, bg=iwmorange}
 % Definitions for biblatex
 \setbeamercolor{bibliography entry note}{fg=iwmgray}
 \setbeamercolor{bibliography entry author}{fg=iwmgray}
 \setbeamertemplate{bibliography item}{}
 \definecolor{iwmorange}{RGB}{255,105,0}
 \definecolor{iwmgray}{RGB}{67,79,79}
 \definecolor{iwmblue}{RGB}{60,180,220}
 \definecolor{iwmgreen}{RGB}{145,200,110}
 \definecolor{iwmpurple}{RGB}{120,0,75}
 \setbeamercolor{title}{fg=iwmorange}
 \setbeamercolor{frametitle}{fg=iwmorange}
 \setbeamercolor{structure}{fg=iwmorange}
 \setbeamercolor{normal text}{fg=iwmgray}
 \setbeamercolor{author}{fg=iwmgray}
 \setbeamercolor{date}{fg=iwmgray}
 \newcommand{\vect}[1]{\mathbf{#1}}
 \newcommand{\mat}[1]{\mathbf{#1}}
 \newcommand{\gvect}[1]{\boldsymbol{#1}}
 \newcommand{\gmat}[1]{\boldsymbol{#1}}
 \AtBeginSection[]{
  \frame{
    \tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}}
 \setbeamertemplate{headline}{
 \begin{beamercolorbox}{section in head}
   \vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
 \end{beamercolorbox}
 }
 \setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
 \title{Data sharing}
 \author{Nora Wickelmaier}
 \date{June 24, 2024}
 \begin{document}
 \begin{frame}{}
 \thispagestyle{empty}
 \titlepage
 \end{frame}
 \begin{frame}{What are the benefits of sharing your data?}
  % slido
  \centering
  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
  \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
 \end{frame}
 \begin{frame}[<+->]{Benefits of sharing data}
  Sharing data
  \begin{itemize}
    \item[\dots] ensures that data are not ultimately lost (save data for posterity)
    \item[\dots] is consistent with scientific norms of openness and rigor
    \item[\dots] increases citation scores of papers
    \item[\dots] encourages more research because it enables secondary analyses
    \item[\dots] facilitates subsequent reanalyses (correct errors, emphasize
      robustness of original results)
    \item[\dots] is demanded by most third party funding agencies
  \end{itemize}
  \vfill
  \hfill\tiny \citet{Wicherts2012}
 \end{frame}
 % Course schedule. The current session (2024-06-24) is shown in regular weight
 % on overlay 1 and in bold on overlay 2.
 \begin{frame}{Agenda}
 \centering
 \begin{tabular}{ll}
 \hline
 Date & Topic \\
 \hline
 2024-05-13 & Introduction to data management \\
 2024-05-27 & Workflow \\
 2024-06-10 & Data organisation \\
 % \textbf replaces the obsolete two-letter font switch \bf
 \only<1>{2024-06-24}\only<2>{\textbf{2024-06-24}} & \only<1>{Data sharing}\only<2>{\textbf{Data sharing}} \\
 2024-07-08 & Clean coding \\
 2024-07-22 & Version control \\
 \hline
 \end{tabular}
 \end{frame}
 % uploading under a license (CC-BY....) 
 % loading data on an archive, repository etc... 
 % Doing the archive
 % Important things before the open-access data
 % Where to store data for long-term accessibility (conventions?)
 % Tools, where I should upload my final data
 % Upload data before or after publishing a paper? Time management
 \section{Data organisation}
 \begin{frame}[<+->]{What we covered so far}
  \begin{itemize}
    \item What habits do we need for effective research data management?
    \item What is a workflow and why do we need one?
    \item What needs to be considered when naming files of a research project?
    \item How to organize folders for a research project?
    \item What metadata should be added to my research project?
    \item What are good ways to document a data set?
  \end{itemize}
 \end{frame}
 \begin{frame}{Examples for documenting data sets}
  \begin{enumerate}
    \item A recent paper with published data by \citet{Ngo2023} investigating
      what cues are considered by Twitter users to identify social bots
    \item A multi-cohort, longitudinal study by the Hector Research Institute of
      Education Sciences and Psychology at the University of Tübingen:
      Transformation of the secondary school system and academic careers
      \citep[TOSCA,][]{Koeller2004}
    \item Editorial on why to publish your data with an accompanying data set
      by \citet{Wicherts2012}
  \end{enumerate}
  \vfill
 \end{frame}
 \begin{frame}{\citet{Ngo2023}}
  They provide
  \begin{itemize}
    \item A data set with 221 observations and 633 variables
    \item A PDF with all measures and the scenario used for collecting the data
  \end{itemize}
  \vspace{.3cm}
    \begin{block}{Exercise}
    \begin{itemize}
      \item Go to \url{https://osf.io/6y3nk/} and download the files
        \texttt{data.csv} and
        \texttt{Experimental-Study-Measures and scenario.pdf}
      \item Read the data into R using \texttt{read.csv()}
      \item Find out which variables in the data correspond to measure
    ``(9)~Demographics''
    \end{itemize}
  \end{block}
  \vspace{.3cm}
  \pause
  (BTW: Sharing the data in this form is better than \emph{not} sharing them,
  in my opinion)
 \end{frame}
 \begin{frame}{What additional information do we need to use these data?}
  % slido
  \centering
  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
  \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
 \end{frame}
 \begin{frame}{TOSCA}
  \begin{itemize}
    \item Multi-cohort study that includes longitudinal data for several cohorts
    \item Broad spectrum of achievement test data and psycho-social variables
    \item Large number of publications on different topics using these data
    \item This is not the original data set, but a prepared version for teaching
      statistics (hence, proportions in the data and the codebook are not
      identical)
  \end{itemize}
    \begin{block}{Exercise}
    \begin{itemize}
      \item Read the data set \texttt{TOSCAtoTeach\_W123.sav} into R using
        \texttt{foreign::read.spss()} or \texttt{haven::read\_spss()}
      \item Create contingency tables for the variables \texttt{sform} and
        \texttt{szweig1} and compare the results to the codebook
        \texttt{Skalenhandbuch\_TOSCAtoTeachW123.pdf}
    \end{itemize}
  \end{block}
  \hfill{\tiny \url{https://uni-tuebingen.de/en/faculties/faculty-of-economics-and-social-sciences/subjects/department-of-social-sciences/education-sciences-and-psychology/research/current-studies/tosca}}
 \end{frame}
 \begin{frame}{\citet{Wicherts2012}}
  They provide
  \begin{itemize}
    \item A data set with 537 observations and 79 variables
      (\texttt{1-s2.0-S0160289612000050-mmc2.xls})
    \item A codebook with variable names and some descriptive statistics for
      the scales (\texttt{1-s2.0-S0160289612000050-mmc1.doc})
    \item ``Publish (your data) or (let the data) perish! Why not publish your
      data too?''
    \item Data come from freshman-testing program called ``Testweek''
    \item (Try \texttt{readxl::read\_excel()} to read the data into R)
  \end{itemize}
  \vfill
 \end{frame}
 \begin{frame}{What is the one thing that would make sharing these data
  infinitely better?}
  % slido
  \centering
  \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
  \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
 \end{frame}
 \begin{frame}[<+->]{Non-anonymous data}
  \begin{itemize}
    \item Before putting data into any cloud, you should always take a moment to
      reflect if your data are anonymous
    \item No (third-party) cloud storage, even if it is not publicly accessible
    \item If your data contains personal data, it should always be stored
      locally, ideally on an encrypted device
    \item You should have a plan --- before ever collecting the data --- how,
      when, and by whom the data will be anonymized
    \item All data should eventually be anonymized! (Yes, even audio and video
      data)
    \item IWM servers can be considered local
  \end{itemize}
 \end{frame}
 \section[Collaborative use]{Sharing data for collaborative use}
 \begin{frame}[<+->]{Working together with the same data}
  \begin{itemize}
    \item Part of data organisation is to think about who needs access to
      your data
    \item Often these are colleagues from the same lab and there is
      infrastructure to share files and scripts easily
    \item The IWM offers several solutions for sharing your data (internally and
      externally)
    \item When the end goal is to make the data public, it might be a good idea
      to work together at a place where the data can go public at a certain
      point in time
    \item We will look at two possibilities: OSF and Github
  \end{itemize}
  \vfill
 \end{frame}
 \begin{frame}{IWM solutions}
  IWM servers
  \begin{itemize}
    \item Nextcloud: \url{https://nextcloud.iwm-tuebingen.de/}
    \item Gitea: \url{https://gitea.iwm-tuebingen.de/}
    \item Shared drive: \texttt{Y:/}
  \end{itemize}
  \vspace{.4cm}
  Microsoft servers
  \begin{itemize}
    \item OneDrive
    \item Teams
  \end{itemize}
  \vfill
  \pause
  (Maybe check out the two tips of the week on this topic:
  {\tiny
  \url{https://iwmonline.sharepoint.com/sites/intranet/SitePages/direktorat/en/Interne-Kommunikation.aspx\#tip-of-the-week-tutorial-series}})
 \end{frame}
 \begin{frame}{Open Science Framework}
  {\url{https://osf.io/}}
  \begin{columns}
  \begin{column}{.4\textwidth}
  \begin{itemize}
    \item ``OSF is a free and open source project management tool that supports
      researchers throughout their entire project lifecycle.''
    \item Founded in 2012 and constantly developed: \url{https://www.cos.io/timeline}
    \item Meant to integrate all research steps
  \end{itemize}
  \end{column}
  \begin{column}{.7\textwidth}
    \includegraphics[scale = .2]{../figures/osf_workflow.png}
  \end{column}
  \end{columns}
 \end{frame}
 \begin{frame}[fragile]{Let's try it out}
  \begin{tikzpicture}[
    every node/.style = {text width = 5.1cm, align = left},
    every path/.style = {thick, draw}
  ]
    \node (ex) at  (0, 0) {\faIcon{folder} \verb+toyexample+};
    \node (n1) at  (5, 0) {\faIcon{folder} \verb+code+};
    \node (n2) at  (5, -1.4) {\faIcon{folder} \verb+data+};
    \node (n3) at  (5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
    \path (ex.center) -- (n1.west);
    \path (ex.center) -- (n2.west);
    \path (ex.center) -- (n3.west);
    \node (o1a) at (10, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
    \node (o1b) at (10, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+};
    \node (o2)  at (10, -1.4) {\faIcon{folder} \verb+processed+};
    \node (o3)  at (10, -2.1) {\faIcon{folder} \verb+rawdata+};
    \node (o4)  at (10, -2.8) {\faIcon[regular]{file} \verb+codebook.pdf+};
    \path (n1.center) -- (o1a.west);
    \path (n1.center) -- (o1b.west);
    \path (n2.center) -- (o2.west);
    \path (n2.center) -- (o3.west);
    \path (n2.center) -- (o4.west);
  \end{tikzpicture}
  Steps
  \begin{enumerate}
    \item You need an OSF account -- just sign up with an e-mail address or use ORCID
    \item Sign in
    \item Create a project
    \item Upload (or link) your files
    \item Invite contributors
  \end{enumerate}
 \end{frame}
 % TODO:
 % Show different cases on OSF:
 % 1. OSF with handmade codebook, all in one folder
 % 2. OSF with different components (show that they can all have different
 % licenses)
 % 3. OSF with Github integrated
 % Show selection of servers (GDPR)
 \begin{frame}{Licenses}
  \begin{columns}
    \begin{column}{.3\textwidth}
      \includegraphics[scale = .4]{../figures/licenses_osf.png}
    \end{column}
    \begin{column}{.7\textwidth}
      \begin{itemize}
        \item OSF offers you several options for licenses
        \item For data the Creative Commons (CC) licenses are usually a good option
        \item For software, other options might be better suited
        \item For code (e.\,g., analysis scripts) CC licenses are also a good
          choice
      \end{itemize}
      \vspace{1cm}
      \hfill{\footnotesize \url{https://creativecommons.org/}}\\
      \hfill{\footnotesize \url{https://help.osf.io/article/288-license-your-project}}\\
      \hfill{\footnotesize \url{https://choosealicense.com/}}
    \end{column}
  \end{columns}
 \end{frame}
 \begin{frame}{Github}
  {\url{https://github.com/}}
  \begin{columns}
  \begin{column}{.8\textwidth}
  \begin{itemize}
    \item Developer platform that allows developers to create, store, manage and
      share code
    \item Based on Git software providing version control
      \begin{itemize}
        \item[+] access control
        \item[+] bug tracking
        \item[+] software feature requests
        \item[+] task management
        \item[+] continuous integration
        \item[+] wikis
      \end{itemize}
    \item Commonly used to host open source software development projects
    \item Bought by Microsoft in 2018 
  \end{itemize}
  \end{column}
  \begin{column}{.3\textwidth}
    \includegraphics[scale = .2]{../figures/github.png}
  \end{column}
  \end{columns}
 \end{frame}
 \begin{frame}{Github workflow}
  \begin{center}
    \includegraphics[scale = .3]{../figures/workflow_git-github.png}
  \end{center}
  \hfill{\tiny \url{https://carpentries-incubator.github.io/open-science-with-r/09-collaborating}}
 \end{frame}
 % TODO:
 % READMEs:
 % https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes
 \section[Repositories]{Sharing data in repositories}
 \begin{frame}{Data repositories}
  National
  \begin{itemize}
    \item \url{https://www.psycharchives.org/}
    \item \url{https://www.forschungsdaten-bildung.de/}
    \item \url{https://datorium.gesis.org/}
    \item \url{https://www.iqb.hu-berlin.de/fdz}
  \end{itemize}
  \vspace{.4cm}
  International
  \begin{itemize}
    \item \url{https://datadryad.org/}
    \item \url{https://osf.io/}
    \item \url{https://zenodo.org/}
  \end{itemize}
  \vfill
  \hfill{\footnotesize \url{https://datamanagement.hms.harvard.edu/share-publish/data-repositories}}
 \end{frame}
 \appendix
 %%\begin{frame}[allowframebreaks]{References}
 \begin{frame}{References}
 %\renewcommand{\bibfont}{\small}
  \printbibliography
 \vfill
 \end{frame}
 \begin{frame}{A codebook should include}
  \begin{tabular}{lp{11cm}}
    \hline
    Variable name & Usually some abbreviation like \texttt{pna01} \\
    Variable label & Brief description to identify variable \\
    Question text & If applicable, exact wording from survey question \\
    Values & Values variable can take (e.\,g., 1 to 5) \\
    Value labels & If applicable, textual descriptions of the values \\
    Statistics & For example, range, mean, standard deviation for
    numeric variables; frequencies and percentages for categorical variables \\
    Missing data & If applicable, values and labels of missing data \\
    Notes & Additional notes, remarks, or comments; for measures or
    questions from copyrighted instruments, the notes field can be used to
    cite the source \\
    \hline
  \end{tabular}
  \vfill
  \hfill\tiny \url{https://www.icpsr.umich.edu/web/ICPSR/cms/1983}
 \end{frame}
 \end{document}
@@ -20,6 +20,10 @@ Frazier, M. R., O'Hara, C. C., Jiang, N., & Halpern, B. S. (2017). Our path
 to better science in less time using open data science tools. _Nature
 Ecology & Evolution, 1_(6), 1-7. https://doi.org/10.1038/s41559-017-0160
 Wicherts, J. M., & Bakker, M. (2012). Publish (your data) or (let the data)
 perish! Why not publish your data too? _Intelligence, 40_(2), 73–76.
 https://doi.org/10.1016/j.intell.2012.01.004
 Wilbrandt, J. (2023). Research Data Management Intro Series: Coffee Lectures &
 Espresso Shots. https://doi.org/10.5281/zenodo.7573695
@@ -8,6 +8,14 @@
  doi       = {10.1525/collabra.18684}
 }
@book{Koeller2004,
  title   = {Wege zur {H}ochschulreife in {B}aden-{W}{\"u}rttemberg: {TOSCA} -- {E}ine {U}ntersuchung an allgemein bildenden und beruflichen {G}ymnasien},
  author  = {K{\"o}ller, Olaf and Watermann, Ralf and Trautwein, Ulrich and L{\"u}dtke, Oliver},
  year    = {2004},
  publisher = {Springer},
  doi     = {10.1007/978-3-322-80906-3}
 }
@article{Lowndes2017,
  title     = {Our path to better science in less time using open data science tools},
  author    = {Lowndes, Julia S Stewart and Best, Benjamin D and Scarborough, Courtney and Afflerbach, Jamie C and Frazier, Melanie R and O'Hara, Casey C and Jiang, Ning and Halpern, Benjamin S},
@@ -30,6 +38,29 @@
  doi       = {10.1177/2515245917747656}
 }
% Ngo et al. (2023): the doi field holds the bare DOI (no resolver URL), like the other entries in this file.
@article{Ngo2023,
  title     = {Spot the bot: Investigating user's detection cues for social bots and their willingness to verify Twitter profiles},
  journal   = {Computers in Human Behavior},
  volume    = {146},
  pages     = {107819},
  year      = {2023},
  issn      = {0747-5632},
  doi       = {10.1016/j.chb.2023.107819},
  url       = {https://www.sciencedirect.com/science/article/pii/S074756322300170X},
  author    = {Thao Ngo and Magdalena Wischnewski and Rebecca Bernemann and Martin Jansen and Nicole Kr{\"a}mer}
 }
@article{Wicherts2012,
  title   = {Publish (your data) or (let the data) perish! {W}hy not publish your data too?},
  author  = {Wicherts, Jelte M and Bakker, Marjan},
  journal = {Intelligence},
  volume  = {40},
  number  = {2},
  pages   = {73--76},
  year    = {2012},
  doi     = {10.1016/j.intell.2012.01.004}
 }
@misc{Wilbrandt2023,
  author    = {Wilbrandt, Jeanne},
  title     = {{Research Data Management Intro Series: Coffee Lectures \& Espresso Shots}},