Slides for fourth session

This commit is contained in:
Nora Wickelmaier 2024-06-23 14:43:31 +02:00
parent e723230ca5
commit 5cdb5fdb8f
9 changed files with 496 additions and 1 deletions

View File

@ -28,7 +28,7 @@ numbered, indicating the order they should be executed in.
The `data` folder contains all folders associated with data and its
documentation.
The `code` folder with contains different codebook options and R scripts that
The `codebook` folder contains different codebook options and R scripts that
create these codebooks. If the codebook is created by an R script, the script
and the codebook are named identically, e.g., `codebook_01.R` and
`codebook_01.xlsx`.

View File

@ -0,0 +1,460 @@
\documentclass[aspectratio=169]{beamer}
\usepackage{listings}
%\usepackage[utf8]{inputenc}
\usepackage[style = apa, backend = biber, natbib = true]{biblatex}
\addbibresource{../literature/lit.bib}
\usepackage{fancyvrb}
\usepackage{fontawesome5} % get icons
\usepackage{multirow}
\usepackage{color, colortbl}
\usepackage{tikz}
\usetikzlibrary{fit}
\usepackage[edges]{forest}
\lstset{language=R,%
backgroundcolor=\color{iwmgray!15!white},
basicstyle=\ttfamily\color{iwmgray},
frame=none,
commentstyle=\slshape\color{iwmgreen},
keywordstyle=\bfseries\color{iwmgray},
identifierstyle=\color{iwmpurple},
stringstyle=\color{iwmblue},
numbers=none,%left,numberstyle=\tiny,
basewidth={.5em, .4em},
showstringspaces=false,
emphstyle=\color{red!50!white}}
\makeatletter \def\newblock{\beamer@newblock} \makeatother
\beamertemplatenavigationsymbolsempty
\setbeamertemplate{itemize items}[circle]
\setbeamertemplate{section in toc}[circle]
\mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
\setbeamercolor{block body}{bg=iwmorange!50!white}
\setbeamercolor{block title}{fg=white, bg=iwmorange}
% Definitions for biblatex
\setbeamercolor{bibliography entry note}{fg=iwmgray}
\setbeamercolor{bibliography entry author}{fg=iwmgray}
\setbeamertemplate{bibliography item}{}
\definecolor{iwmorange}{RGB}{255,105,0}
\definecolor{iwmgray}{RGB}{67,79,79}
\definecolor{iwmblue}{RGB}{60,180,220}
\definecolor{iwmgreen}{RGB}{145,200,110}
\definecolor{iwmpurple}{RGB}{120,0,75}
\setbeamercolor{title}{fg=iwmorange}
\setbeamercolor{frametitle}{fg=iwmorange}
\setbeamercolor{structure}{fg=iwmorange}
\setbeamercolor{normal text}{fg=iwmgray}
\setbeamercolor{author}{fg=iwmgray}
\setbeamercolor{date}{fg=iwmgray}
\newcommand{\vect}[1]{\mathbf{#1}}
\newcommand{\mat}[1]{\mathbf{#1}}
\newcommand{\gvect}[1]{\boldsymbol{#1}}
\newcommand{\gmat}[1]{\boldsymbol{#1}}
\AtBeginSection[]{
\frame{
\tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}}
\setbeamertemplate{headline}{
\begin{beamercolorbox}{section in head}
\vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
\end{beamercolorbox}
}
\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
\title{Data sharing}
\author{Nora Wickelmaier}
\date{June 24, 2024}
\begin{document}
\begin{frame}{}
\thispagestyle{empty}
\titlepage
\end{frame}
\begin{frame}{What are the benefits of sharing your data?}
% slido
\centering
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
\end{frame}
\begin{frame}[<+->]{Benefits of sharing data}
Sharing data
\begin{itemize}
\item[\dots] ensures that data are not ultimately lost (save data for posterity)
\item[\dots] is consistent with scientific norms of openness and rigor
\item[\dots] increases citation scores of papers
\item[\dots] encourages more research because it enables secondary analyses
\item[\dots] facilitates subsequent reanalyses (correct errors, emphasize
robustness of original results)
\item[\dots] is demanded by most third-party funding agencies
\end{itemize}
\vfill
\hfill\tiny \citet{Wicherts2012}
\end{frame}
\begin{frame}{Agenda}
\centering
\begin{tabular}{ll}
\hline
Date & Topic \\
\hline
2024-05-13 & Introduction to data management \\
2024-05-27 & Workflow \\
2024-06-10 & Data organisation\\
\only<1>{2024-06-24}\only<2>{\bf 2024-06-24} & \only<1>{Data sharing}\only<2>{\bf Data sharing} \\
2024-07-08 & Clean coding \\
2024-07-22 & Version control \\
\hline
\end{tabular}
\end{frame}
% uploading under a license (CC-BY....)
% loading data on an archive, repository etc...
% Doing the archive
% Important things before the open-access data
% Where to store data for long-term accessibility (conventions?)
% Tools, where I should upload my final data
% Upload data before or after publishing a paper? Time management
\section{Data organisation}
\begin{frame}[<+->]{What we covered so far}
\begin{itemize}
\item What habits do we need for effective research data management?
\item What is a workflow and why do we need one?
\item What needs to be considered when naming files of a research project?
\item How to organize folders for a research project?
\item What metadata should be added to my research project?
\item What are good ways to document a data set?
\end{itemize}
\end{frame}
\begin{frame}{Examples for documenting data sets}
\begin{enumerate}
\item A recent paper with published data by \citet{Ngo2023} investigating
what cues are considered by Twitter users to identify social bots
\item A multi-cohort, longitudinal study by the Hector Research Institute of
Education Sciences and Psychology at the University of Tübingen:
Transformation of the secondary school system and academic careers
\citep[TOSCA,][]{Koeller2004}
\item Editorial on why to publish your data with an accompanying data set
by \citet{Wicherts2012}
\end{enumerate}
\vfill
\end{frame}
\begin{frame}{\citet{Ngo2023}}
They provide
\begin{itemize}
\item A data set with 221 observations and 633 variables
\item A PDF with all measures and the scenario used for collecting the data
\end{itemize}
\vspace{.3cm}
\begin{block}{Exercise}
\begin{itemize}
\item Go to \url{https://osf.io/6y3nk/} and download the files
\texttt{data.csv} and
\texttt{Experimental-Study-Measures and scenario.pdf}
\item Read the data into R using \texttt{read.csv()}
\item Find out which variables in the data correspond to measure
``(9)~Demographics''
\end{itemize}
\end{block}
\vspace{.3cm}
\pause
(BTW: Sharing the data in this form is better than \emph{not} sharing them,
in my opinion)
\end{frame}
\begin{frame}{What additional information do we need to use these data?}
% slido
\centering
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
\end{frame}
\begin{frame}{TOSCA}
\begin{itemize}
\item Multi-cohort study that includes longitudinal data for several cohorts
\item Broad spectrum of achievement test data and psycho-social variables
\item Large number of publications on different topics using these data
\item This is not the original data set, but a prepared version for teaching
statistics (hence, proportions in the data and the codebook are not
identical)
\end{itemize}
\begin{block}{Exercise}
\begin{itemize}
\item Read the data set \texttt{TOSCAtoTeach\_W123.sav} into R using
\texttt{foreign::read.spss()} or \texttt{haven::read\_spss()}
\item Create contingency tables for the variables \texttt{sform} and
\texttt{szweig1} and compare the results to the codebook
\texttt{Skalenhandbuch\_TOSCAtoTeachW123.pdf}
\end{itemize}
\end{block}
\hfill{\tiny \url{https://uni-tuebingen.de/en/faculties/faculty-of-economics-and-social-sciences/subjects/department-of-social-sciences/education-sciences-and-psychology/research/current-studies/tosca}}
\end{frame}
\begin{frame}{\citet{Wicherts2012}}
They provide
\begin{itemize}
\item A data set with 537 observations and 79 variables
(\texttt{1-s2.0-S0160289612000050-mmc2.xls})
\item A codebook with variable names and some descriptive statistics for
the scales (\texttt{1-s2.0-S0160289612000050-mmc1.doc})
\item ``Publish (your data) or (let the data) perish! Why not publish your
data too?''
\item Data come from a freshman-testing program called ``Testweek''
\item (Try \texttt{readxl::read\_excel()} to read the data into R)
\end{itemize}
\vfill
\end{frame}
\begin{frame}{What is the single thing that would make sharing these data
infinitely better?}
% slido
\centering
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4}
\url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd}
\end{frame}
\begin{frame}[<+->]{Non-anonymous data}
\begin{itemize}
\item Before putting data into any cloud, you should always take a moment to
reflect if your data are anonymous
\item No (third-party) cloud storage, even if it is not publicly accessible
\item If your data contain personal data, they should always be stored
locally, ideally on an encrypted device
\item You should have a plan --- before ever collecting the data --- how,
when, and by whom the data will be anonymized
\item All data should eventually be anonymized! (Yes, even audio and video
data)
\item IWM servers can be considered local
\end{itemize}
\end{frame}
\section[Collaborative use]{Sharing data for collaborative use}
\begin{frame}[<+->]{Working together with the same data}
\begin{itemize}
\item Part of data organisation is to think about who needs access to
your data
\item Often these are colleagues from the same lab and there is
infrastructure to share files and scripts easily
\item The IWM offers several solutions for sharing your data (internally and
externally)
\item When the end goal is to make the data public, it might be a good idea
to work together at a place where the data can go public at a certain
point in time
\item We will look at two possibilities: OSF and Github
\end{itemize}
\vfill
\end{frame}
\begin{frame}{IWM solutions}
IWM servers
\begin{itemize}
\item Nextcloud: \url{https://nextcloud.iwm-tuebingen.de/}
\item Gitea: \url{https://gitea.iwm-tuebingen.de/}
\item Shared drive: \texttt{Y:/}
\end{itemize}
\vspace{.4cm}
Microsoft servers
\begin{itemize}
\item OneDrive
\item Teams
\end{itemize}
\vfill
\pause
(Maybe check out the two tips of the week on this topic:
{\tiny
\url{https://iwmonline.sharepoint.com/sites/intranet/SitePages/direktorat/en/Interne-Kommunikation.aspx\#tip-of-the-week-tutorial-series}})
\end{frame}
\begin{frame}{Open Science Framework}
{\url{https://osf.io/}}
\begin{columns}
\begin{column}{.4\textwidth}
\begin{itemize}
\item ``OSF is a free and open source project management tool that supports
researchers throughout their entire project lifecycle.''
\item Founded in 2012 and constantly developed: \url{https://www.cos.io/timeline}
\item Meant to integrate all research steps
\end{itemize}
\end{column}
\begin{column}{.7\textwidth}
\includegraphics[scale = .2]{../figures/osf_workflow.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[fragile]{Let's try it out}
\begin{tikzpicture}[
every node/.style = {text width = 5.1cm, align = left},
every path/.style = {thick, draw}
]
\node (ex) at (0, 0) {\faIcon{folder} \verb+toyexample+};
\node (n1) at (5, 0) {\faIcon{folder} \verb+code+};
\node (n2) at (5, -1.4) {\faIcon{folder} \verb+data+};
\node (n3) at (5, -2.8) {\faIcon[regular]{file} \verb+README.md+};
\path (ex.center) -- (n1.west);
\path (ex.center) -- (n2.west);
\path (ex.center) -- (n3.west);
\node (o1a) at (10, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+};
\node (o1b) at (10, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+};
\node (o2) at (10, -1.4) {\faIcon{folder} \verb+processed+};
\node (o3) at (10, -2.1) {\faIcon{folder} \verb+rawdata+};
\node (o4) at (10, -2.8) {\faIcon[regular]{file} \verb+codebook.pdf+};
\path (n1.center) -- (o1a.west);
\path (n1.center) -- (o1b.west);
\path (n2.center) -- (o2.west);
\path (n2.center) -- (o3.west);
\path (n2.center) -- (o4.west);
\end{tikzpicture}
Steps
\begin{enumerate}
\item You need an OSF account -- just sign up with an e-mail address or use ORCID
\item Sign in
\item Create a project
\item Upload (or link) your files
\item Invite contributors
\end{enumerate}
\end{frame}
% TODO:
% Show different cases on OSF:
% 1. OSF with handmade codebook, all in one folder
% 2. OSF with different components (show that they can all have different
% licenses)
% 3. OSF with Github integrated
% Show selection of servers (GDPR)
\begin{frame}{Licenses}
\begin{columns}
\begin{column}{.3\textwidth}
\includegraphics[scale = .4]{../figures/licenses_osf.png}
\end{column}
\begin{column}{.7\textwidth}
\begin{itemize}
\item OSF offers you several options for licenses
\item For data the Creative Commons (CC) licenses are usually a good option
\item For software, other options might be better suited
\item For code (e.\,g., analysis scripts) CC licenses are also a good
choice
\end{itemize}
\vspace{1cm}
\hfill{\footnotesize \url{https://creativecommons.org/}}\\
\hfill{\footnotesize \url{https://help.osf.io/article/288-license-your-project}}\\
\hfill{\footnotesize \url{https://choosealicense.com/}}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Github}
{\url{https://github.com/}}
\begin{columns}
\begin{column}{.8\textwidth}
\begin{itemize}
\item Developer platform that allows developers to create, store, manage and
share code
\item Based on Git software providing version control
\begin{itemize}
\item[+] access control
\item[+] bug tracking
\item[+] software feature requests
\item[+] task management
\item[+] continuous integration
\item[+] wikis
\end{itemize}
\item Commonly used to host open source software development projects
\item Bought by Microsoft in 2018
\end{itemize}
\end{column}
\begin{column}{.3\textwidth}
\includegraphics[scale = .2]{../figures/github.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Github workflow}
\begin{center}
\includegraphics[scale = .3]{../figures/workflow_git-github.png}
\end{center}
\hfill{\tiny \url{https://carpentries-incubator.github.io/open-science-with-r/09-collaborating}}
\end{frame}
% TODO:
% READMEs:
% https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes
\section[Repositories]{Sharing data in repositories}
\begin{frame}{Data repositories}
National
\begin{itemize}
\item \url{https://www.psycharchives.org/}
\item \url{https://www.forschungsdaten-bildung.de/}
\item \url{https://datorium.gesis.org/}
\item \url{https://www.iqb.hu-berlin.de/fdz}
\end{itemize}
\vspace{.4cm}
International
\begin{itemize}
\item \url{https://datadryad.org/}
\item \url{https://osf.io/}
\item \url{https://zenodo.org/}
\end{itemize}
\vfill
\hfill{\footnotesize \url{https://datamanagement.hms.harvard.edu/share-publish/data-repositories}}
\end{frame}
\appendix
%%\begin{frame}[allowframebreaks]{References}
\begin{frame}{References}
%\renewcommand{\bibfont}{\small}
\printbibliography
\vfill
\end{frame}
\begin{frame}{A codebook should include}
\begin{tabular}{lp{11cm}}
\hline
Variable name & Usually some abbreviation like \texttt{pna01} \\
Variable label & Brief description to identify variable \\
Question text & If applicable, exact wording from survey question \\
Values & Values variable can take (e.\,g., 1 to 5) \\
Value labels & If applicable, textual descriptions of the values \\
Statistics & For example, range, mean, standard deviation for
numeric variables; frequencies and percentages for categorical variables \\
Missing data & If applicable, values and labels of missing data \\
Notes & Additional notes, remarks, or comments; for measures or
questions from copyrighted instruments, the notes field can be used to
cite the source \\
\hline
\end{tabular}
\vfill
\hfill\tiny \url{https://www.icpsr.umich.edu/web/ICPSR/cms/1983}
\end{frame}
\end{document}

View File

@ -20,6 +20,10 @@ Frazier, M. R., O'Hara, C. C., Jiang, N., & Halpern, B. S. (2017). Our path
to better science in less time using open data science tools. _Nature
Ecology & Evolution, 1_(6), 1-7. https://doi.org/10.1038/s41559-017-0160
Wicherts, J. M., & Bakker, M. (2012). Publish (your data) or (let the data)
perish! Why not publish your data too? _Intelligence, 40_(2), 73-76.
https://doi.org/10.1016/j.intell.2012.01.004
Wilbrandt, J. (2023). Research Data Management Intro Series: Coffee Lectures &
Espresso Shots. https://doi.org/10.5281/zenodo.7573695

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.6 KiB

BIN
figures/github.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
figures/licenses_osf.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

BIN
figures/osf_workflow.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 529 KiB

View File

@ -8,6 +8,14 @@
doi = {10.1525/collabra.18684}
}
@book{Koeller2004,
title = {Wege zur {H}ochschulreife in {B}aden-{W}{\"u}rttemberg: {TOSCA} -- {E}ine {U}ntersuchung an allgemein bildenden und beruflichen {G}ymnasien},
author = {K{\"o}ller, Olaf and Watermann, Ralf and Trautwein, Ulrich and L{\"u}dtke, Oliver},
year = {2004},
publisher = {Springer},
doi = {10.1007/978-3-322-80906-3}
}
@article{Lowndes2017,
title = {Our path to better science in less time using open data science tools},
author = {Lowndes, Julia S Stewart and Best, Benjamin D and Scarborough, Courtney and Afflerbach, Jamie C and Frazier, Melanie R and O'Hara, Casey C and Jiang, Ning and Halpern, Benjamin S},
@ -30,6 +38,29 @@
doi = {10.1177/2515245917747656}
}
@article{Ngo2023,
title = {Spot the bot: Investigating user's detection cues for social bots and their willingness to verify Twitter profiles},
journal = {Computers in Human Behavior},
volume = {146},
pages = {107819},
year = {2023},
issn = {0747-5632},
doi = {10.1016/j.chb.2023.107819},
url = {https://www.sciencedirect.com/science/article/pii/S074756322300170X},
author = {Thao Ngo and Magdalena Wischnewski and Rebecca Bernemann and Martin Jansen and Nicole Kr{\"a}mer}
}
@article{Wicherts2012,
title = {Publish (your data) or (let the data) perish! {W}hy not publish your data too?},
author = {Wicherts, Jelte M and Bakker, Marjan},
journal = {Intelligence},
volume = {40},
number = {2},
pages = {73--76},
year = {2012},
doi = {10.1016/j.intell.2012.01.004}
}
@misc{Wilbrandt2023,
author = {Wilbrandt, Jeanne},
title = {{Research Data Management Intro Series: Coffee Lectures \& Espresso Shots}},