diff --git a/03_data_organisation/example/README.md b/03_data_organisation/example/README.md index 940601a..5162ad9 100644 --- a/03_data_organisation/example/README.md +++ b/03_data_organisation/example/README.md @@ -28,7 +28,7 @@ numbered, indicating the order they should be executed in. The `data` folder contains all folders associated with data and its documentation. -The `code` folder with contains different codebook options and R scripts that +The `codebook` folder contains different codebook options and R scripts that create these codebooks. If the codebook is created by an R script, the script and the codebook are named identically, e.g., `codebook_01.R` and `codebook_01.xslx`. diff --git a/04_data_sharing/04_data_sharing.tex b/04_data_sharing/04_data_sharing.tex new file mode 100644 index 0000000..263ceea --- /dev/null +++ b/04_data_sharing/04_data_sharing.tex @@ -0,0 +1,460 @@ +\documentclass[aspectratio=169]{beamer} + +\usepackage{listings} +%\usepackage[utf8]{inputenc} +\usepackage[style = apa, backend = biber, natbib = true]{biblatex} +\addbibresource{../literature/lit.bib} + +\usepackage{fancyvrb} +\usepackage{fontawesome5} % get icons +\usepackage{multirow} +\usepackage{color, colortbl} + +\usepackage{tikz} +\usetikzlibrary{fit} +\usepackage[edges]{forest} + +\lstset{language=R,% + backgroundcolor=\color{iwmgray!15!white}, + basicstyle=\ttfamily\color{iwmgray}, + frame=none, + commentstyle=\slshape\color{iwmgreen}, + keywordstyle=\bfseries\color{iwmgray}, + identifierstyle=\color{iwmpurple}, + stringstyle=\color{iwmblue}, + numbers=none,%left,numberstyle=\tiny, + basewidth={.5em, .4em}, + showstringspaces=false, + emphstyle=\color{red!50!white}} + +\makeatletter \def\newblock{\beamer@newblock} \makeatother + +\beamertemplatenavigationsymbolsempty +\setbeamertemplate{itemize items}[circle] +\setbeamertemplate{section in toc}[circle] +\mode{\setbeamercolor{math text displayed}{fg=iwmgray}} +\setbeamercolor{block body}{bg=iwmorange!50!white} 
+\setbeamercolor{block title}{fg=white, bg=iwmorange} +% Definitions for biblatex +\setbeamercolor{bibliography entry note}{fg=iwmgray} +\setbeamercolor{bibliography entry author}{fg=iwmgray} +\setbeamertemplate{bibliography item}{} + +\definecolor{iwmorange}{RGB}{255,105,0} +\definecolor{iwmgray}{RGB}{67,79,79} +\definecolor{iwmblue}{RGB}{60,180,220} +\definecolor{iwmgreen}{RGB}{145,200,110} +\definecolor{iwmpurple}{RGB}{120,0,75} + +\setbeamercolor{title}{fg=iwmorange} +\setbeamercolor{frametitle}{fg=iwmorange} +\setbeamercolor{structure}{fg=iwmorange} +\setbeamercolor{normal text}{fg=iwmgray} +\setbeamercolor{author}{fg=iwmgray} +\setbeamercolor{date}{fg=iwmgray} + +\newcommand{\vect}[1]{\mathbf{#1}} +\newcommand{\mat}[1]{\mathbf{#1}} +\newcommand{\gvect}[1]{\boldsymbol{#1}} +\newcommand{\gmat}[1]{\boldsymbol{#1}} + +\AtBeginSection[]{ + \frame{ + \tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}} + +\setbeamertemplate{headline}{ + \begin{beamercolorbox}{section in head} + \vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt + \end{beamercolorbox} +} + +\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt} + +\title{Data sharing} +\author{Nora Wickelmaier} +\date{June 24, 2024} + +\begin{document} + +\begin{frame}{} +\thispagestyle{empty} +\titlepage +\end{frame} + +\begin{frame}{What are the benefits of sharing your data?} + % slido + \centering + \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4} + + \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd} +\end{frame} + +\begin{frame}[<+->]{Benefits of sharing data} + Sharing data + \begin{itemize} + \item[\dots] ensures that data are not ultimately lost (save data for posterity) + \item[\dots] is consistent with scientific norms of openness and rigor + \item[\dots] increases citation scores of papers + \item[\dots] encourages more research because it enables secondary analyses + \item[\dots] facilitates 
subsequent reanalyses (correct errors, emphasize + robustness of original results) + \item[\dots] is demanded by most third-party funding agencies + \end{itemize} + \vfill + \hfill\tiny \citet{Wicherts2012} +\end{frame} + +\begin{frame}{Agenda} +\centering +\begin{tabular}{ll} +\hline +Date & Topic \\ +\hline +2024-05-13 & Introduction to data management \\ +2024-05-27 & Workflow \\ +2024-06-10 & Data organisation\\ +\only<1>{2024-06-24}\only<2>{\bf 2024-06-24} & \only<1>{Data sharing}\only<2>{\bf Data sharing} \\ +2024-07-08 & Clean coding \\ +2024-07-22 & Version control \\ +\hline +\end{tabular} +\end{frame} + +% uploading under a license (CC-BY....) +% loading data on an archive, repository etc... +% Doing the archive +% Important things before the open-access data +% Where to store data for long-term accessibility (conventions?) +% Tools, where I should upload my final data +% Upload data before or after publishing a paper? Time management + +\section{Data organisation} + +\begin{frame}[<+->]{What we covered so far} + \begin{itemize} + \item What habits do we need for effective research data management? + \item What is a workflow and why do we need one? + \item What needs to be considered when naming files of a research project? + \item How to organize folders for a research project? + \item What metadata should be added to my research project? + \item What are good ways to document a data set? 
+ \end{itemize} +\end{frame} + +\begin{frame}{Examples for documenting data sets} + \begin{enumerate} + \item A recent paper with published data by \citet{Ngo2023} investigating + what cues are considered by Twitter users to identify social bots + \item A multi-cohort, longitudinal study by the Hector Research Institute of + Education Sciences and Psychology at the university of Tübingen: + Transformation of the secondary school system and academic careers + \citep[TOSCA,][]{Koeller2004} + \item Editorial on why to publish your data with an accompanying data set + by \citet{Wicherts2012} + \end{enumerate} + \vfill +\end{frame} + +\begin{frame}{\citet{Ngo2023}} + They provide + \begin{itemize} + \item A data set with 221 observations and 633 variables + \item A PDF with all measures and the scenario used for collecting the data + \end{itemize} + \vspace{.3cm} + \begin{block}{Exercise} + \begin{itemize} + \item Go to \url{https://osf.io/6y3nk/} and download the files + \texttt{data.csv} and + \texttt{Experimental-Study-Measures and scenario.pdf} + \item Read the data into R using \texttt{read.csv()} + \item Find out which variables in the data correspond to measure + ``(9)~Demographics'' + \end{itemize} + \end{block} + \vspace{.3cm} + \pause + (BTW: Sharing the data in this form is better than \emph{not} sharing them, + in my opinion) +\end{frame} + +\begin{frame}{What additional information do we need to use these data?} + % slido + \centering + \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4} + + \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd} +\end{frame} + +\begin{frame}{TOSCA} + \begin{itemize} + \item Multi-cohort study that includes longitudinal data for several cohorts + \item Broad spectrum of achievement test data and psycho-social variables + \item Large number of publications on different topics using these data + \item This is not the original data set, but a prepared version for teaching + statistics 
(hence, proportions in the data and the codebook are not + identical) + \end{itemize} + \begin{block}{Exercise} + \begin{itemize} + \item Read the data set \texttt{TOSCAtoTeach\_W123.sav} into R using + \texttt{foreign::read.spss()} or \texttt{haven::read\_spss()} + \item Create contingency tables for the variables \texttt{sform} and + \texttt{szweig1} and compare the results to the codebook + \texttt{Skalenhandbuch\_TOSCAtoTeachW123.pdf} + \end{itemize} + \end{block} + \hfill{\tiny \url{https://uni-tuebingen.de/en/faculties/faculty-of-economics-and-social-sciences/subjects/department-of-social-sciences/education-sciences-and-psychology/research/current-studies/tosca}} +\end{frame} + +\begin{frame}{\citet{Wicherts2012}} + They provide + \begin{itemize} + \item A data set with 537 observations and 79 variables + (\texttt{1-s2.0-S0160289612000050-mmc2.xls}) + \item A codebook with variable names and some descriptive statistics for + the scales (\texttt{1-s2.0-S0160289612000050-mmc1.doc}) + \item ``Publish (your data) or (let the data) perish! 
Why not publish your + data too?'' + \item Data come from a freshman-testing program called ``Testweek'' + \item (Try \texttt{readxl::read\_excel()} to read the data into R) + \end{itemize} + \vfill +\end{frame} + +\begin{frame}{What is the single one thing that would make sharing these data + infinitely better?} + % slido + \centering + \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 4} + + \url{https://app.sli.do/event/m5FEcBYkqtVAsjkdTsKsmd} +\end{frame} + +\begin{frame}[<+->]{Non-anonymous data} + \begin{itemize} + \item Before putting data into any cloud, you should always take a moment to + reflect if your data are anonymous + \item No (third-party) cloud storage, even if it is not publicly accessible + \item If your data contains personal data, it should always be stored + locally, ideally on an encrypted device + \item You should have a plan --- before ever collecting the data --- how, + when, and by whom the data will be anonymized + \item All data should eventually be anonymized! 
(Yes, even audio and video + data) + \item IWM servers can be considered local + \end{itemize} +\end{frame} + +\section[Collaborative use]{Sharing data for collaborative use} + +\begin{frame}[<+->]{Working together with the same data} + \begin{itemize} + \item Part of data organisation is to think about who needs access to + your data + \item Often these are colleagues from the same lab and there is + infrastructure to share files and scripts easily + \item The IWM offers several solutions for sharing your data (internally and + externally) + \item When the end goal is to make the data public, it might be a good idea + to work together at a place where the data can go public at a certain + point in time + \item We will look at two possibilities: OSF and Github + \end{itemize} + \vfill +\end{frame} + +\begin{frame}{IWM solutions} + IWM servers + \begin{itemize} + \item Nextcloud: \url{https://nextcloud.iwm-tuebingen.de/} + \item Gitea: \url{https://gitea.iwm-tuebingen.de/} + \item Shared drive: \texttt{Y:/} + \end{itemize} + \vspace{.4cm} + Microsoft servers + \begin{itemize} + \item OneDrive + \item Teams + \end{itemize} + \vfill + \pause + (Maybe check out the two tips of the week on this topic: + {\tiny + \url{https://iwmonline.sharepoint.com/sites/intranet/SitePages/direktorat/en/Interne-Kommunikation.aspx\#tip-of-the-week-tutorial-series}}) +\end{frame} + +\begin{frame}{Open Science Framework} + {\url{https://osf.io/}} + \begin{columns} + \begin{column}{.4\textwidth} + \begin{itemize} + \item ``OSF is a free and open source project management tool that supports + researchers throughout their entire project lifecycle.'' + \item Founded in 2012 and constantly developed: \url{https://www.cos.io/timeline} + \item Meant to integrate all research steps + \end{itemize} + \end{column} + \begin{column}{.7\textwidth} + \includegraphics[scale = .2]{../figures/osf_workflow.png} + \end{column} + \end{columns} +\end{frame} + +\begin{frame}[fragile]{Let's try it out} + 
\begin{tikzpicture}[ + every node/.style = {text width = 5.1cm, align = left}, + every path/.style = {thick, draw} + ] + \node (ex) at (0, 0) {\faIcon{folder} \verb+toyexample+}; + \node (n1) at (5, 0) {\faIcon{folder} \verb+code+}; + \node (n2) at (5, -1.4) {\faIcon{folder} \verb+data+}; + \node (n3) at (5, -2.8) {\faIcon[regular]{file} \verb+README.md+}; + \path (ex.center) -- (n1.west); + \path (ex.center) -- (n2.west); + \path (ex.center) -- (n3.west); + + \node (o1a) at (10, 0) {\faIcon[regular]{file} \verb+01_preprocessing.R+}; + \node (o1b) at (10, -0.7) {\faIcon[regular]{file} \verb+02_descriptives.R+}; + \node (o2) at (10, -1.4) {\faIcon{folder} \verb+processed+}; + \node (o3) at (10, -2.1) {\faIcon{folder} \verb+rawdata+}; + \node (o4) at (10, -2.8) {\faIcon[regular]{file} \verb+codebook.pdf+}; + \path (n1.center) -- (o1a.west); + \path (n1.center) -- (o1b.west); + \path (n2.center) -- (o2.west); + \path (n2.center) -- (o3.west); + \path (n2.center) -- (o4.west); + \end{tikzpicture} + Steps + \begin{enumerate} + \item You need an OSF account -- just sign up with an e-mail address or use ORCID + \item Sign in + \item Create a project + \item Upload (or link) your files + \item Invite contributors + \end{enumerate} +\end{frame} + +% TODO: + +% Show different cases on OSF: +% 1. OSF with handmade codebook, all in one folder +% 2. OSF with different components (show that they can all have different +% licenses) +% 3. 
OSF with Github integrated + +% Show selection of servers (GDPR) + +\begin{frame}{Licenses} + \begin{columns} + \begin{column}{.3\textwidth} + \includegraphics[scale = .4]{../figures/licenses_osf.png} + \end{column} + \begin{column}{.7\textwidth} + \begin{itemize} + \item OSF offers you several options for licenses + \item For data, the Creative Commons (CC) licenses are usually a good option + \item For software, other options might be better suited + \item For code (e.\,g., analysis scripts) CC licenses are also a good + choice + \end{itemize} + \vspace{1cm} + + \hfill{\footnotesize \url{https://creativecommons.org/}}\\ + \hfill{\footnotesize \url{https://help.osf.io/article/288-license-your-project}}\\ + \hfill{\footnotesize \url{https://choosealicense.com/}} + \end{column} + \end{columns} +\end{frame} + +\begin{frame}{Github} + {\url{https://github.com/}} + \begin{columns} + \begin{column}{.8\textwidth} + \begin{itemize} + \item Developer platform that allows developers to create, store, manage and + share code + \item Based on Git software providing version control + \begin{itemize} + \item[+] access control + \item[+] bug tracking + \item[+] software feature requests + \item[+] task management + \item[+] continuous integration + \item[+] wikis + \end{itemize} + \item Commonly used to host open source software development projects + \item Bought by Microsoft in 2018 + \end{itemize} + \end{column} + \begin{column}{.3\textwidth} + \includegraphics[scale = .2]{../figures/github.png} + \end{column} + \end{columns} +\end{frame} + +\begin{frame}{Github workflow} + \begin{center} + \includegraphics[scale = .3]{../figures/workflow_git-github.png} + \end{center} + \hfill{\tiny \url{https://carpentries-incubator.github.io/open-science-with-r/09-collaborating}} +\end{frame} + +% TODO: + +% READMEs: +% https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes + +\section[Repositories]{Sharing data in 
repositories} + +\begin{frame}{Data repositories} + National + \begin{itemize} + \item \url{https://www.psycharchives.org/} + \item \url{https://www.forschungsdaten-bildung.de/} + \item \url{https://datorium.gesis.org/} + \item \url{https://www.iqb.hu-berlin.de/fdz} + \end{itemize} + \vspace{.4cm} + International + \begin{itemize} + \item \url{https://datadryad.org/} + \item \url{https://osf.io/} + \item \url{https://zenodo.org/} + \end{itemize} + \vfill + + \hfill{\footnotesize \url{https://datamanagement.hms.harvard.edu/share-publish/data-repositories}} +\end{frame} + +\appendix +%%\begin{frame}[allowframebreaks]{References} +\begin{frame}{References} +%\renewcommand{\bibfont}{\small} + \printbibliography +\vfill +\end{frame} + +\begin{frame}{A codebook should include} + \begin{tabular}{lp{11cm}} + \hline + Variable name & Usually some abbreviation like \texttt{pna01} \\ + Variable label & Brief description to identify variable \\ + Question text & If applicable, exact wording from survey question \\ + Values & Values variable can take (e.\,g., 1 to 5) \\ + Value labels & If applicable, textual descriptions of the values \\ + Statistics & For example, range, mean, standard deviation for + numeric variables; frequencies and percentages for categorical variables \\ + Missing data & If applicable, values and labels of missing data \\ + Notes & Additional notes, remarks, or comments; for measures or + questions from copyrighted instruments, the notes field can be used to + cite the source \\ + \hline + \end{tabular} + \vfill + + \hfill\tiny \url{https://www.icpsr.umich.edu/web/ICPSR/cms/1983} +\end{frame} + +\end{document} + diff --git a/README.md b/README.md index 7aa8cb8..5a1a50f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,10 @@ Frazier, M. R., O'Hara, C. C., Jiang, N., & Halpern, B. S. (2017). Our path to better science in less time using open data science tools. _Nature Ecology & Evolution, 1_(6), 1-7. https://doi.org/10.1038/s41559-017-0160 +Wicherts, J. 
M., & Bakker, M. (2012). Publish (your data) or (let the data) +perish! Why not publish your data too? _Intelligence, 40_(2), 73–76. +https://doi.org/10.1016/j.intell.2012.01.004 + Wilbrandt, J. (2023). Research Data Management Intro Series: Coffee Lectures & Espresso Shots. https://doi.org/10.5281/zenodo.7573695 diff --git a/figures/QR Code for Methodenseminar SS 2024 - Session 4.png b/figures/QR Code for Methodenseminar SS 2024 - Session 4.png new file mode 100644 index 0000000..81a3c55 Binary files /dev/null and b/figures/QR Code for Methodenseminar SS 2024 - Session 4.png differ diff --git a/figures/github.png b/figures/github.png new file mode 100644 index 0000000..457b41e Binary files /dev/null and b/figures/github.png differ diff --git a/figures/licenses_osf.png b/figures/licenses_osf.png new file mode 100644 index 0000000..3075e72 Binary files /dev/null and b/figures/licenses_osf.png differ diff --git a/figures/osf_workflow.png b/figures/osf_workflow.png new file mode 100644 index 0000000..a39aab3 Binary files /dev/null and b/figures/osf_workflow.png differ diff --git a/figures/workflow_git-github.png b/figures/workflow_git-github.png new file mode 100644 index 0000000..8810e43 Binary files /dev/null and b/figures/workflow_git-github.png differ diff --git a/literature/lit.bib b/literature/lit.bib index 6d438f0..557f5e0 100644 --- a/literature/lit.bib +++ b/literature/lit.bib @@ -8,6 +8,14 @@ doi = {10.1525/collabra.18684} } +@book{Koeller2004, + title = {Wege zur {H}ochschulreife in {B}aden-{W}{\"u}rttemberg: {TOSCA} -- {E}ine {U}ntersuchung an allgemein bildenden und beruflichen {G}ymnasien}, + author = {K{\"o}ller, Olaf and Watermann, Ralf and Trautwein, Ulrich and L{\"u}dtke, Oliver}, + year = {2004}, + publisher = {Springer}, + doi = {10.1007/978-3-322-80906-3} +} + @article{Lowndes2017, title = {Our path to better science in less time using open data science tools}, author = {Lowndes, Julia S Stewart and Best, Benjamin D and Scarborough, Courtney and 
Afflerbach, Jamie C and Frazier, Melanie R and O'Hara, Casey C and Jiang, Ning and Halpern, Benjamin S}, @@ -30,6 +38,29 @@ doi = {10.1177/2515245917747656} } +@article{Ngo2023, + title = {Spot the bot: Investigating user's detection cues for social bots and their willingness to verify {Twitter} profiles}, + author = {Ngo, Thao and Wischnewski, Magdalena and Bernemann, Rebecca and Jansen, Martin and Kr{\"a}mer, Nicole}, + journal = {Computers in Human Behavior}, + volume = {146}, + pages = {107819}, + year = {2023}, + issn = {0747-5632}, + doi = {10.1016/j.chb.2023.107819}, + url = {https://www.sciencedirect.com/science/article/pii/S074756322300170X} +} + +@article{Wicherts2012, + title = {Publish (your data) or (let the data) perish! {W}hy not publish your data too?}, + author = {Wicherts, Jelte M and Bakker, Marjan}, + journal = {Intelligence}, + volume = {40}, + number = {2}, + pages = {73--76}, + year = {2012}, + doi = {10.1016/j.intell.2012.01.004} +} + @misc{Wilbrandt2023, author = {Wilbrandt, Jeanne}, title = {{Research Data Management Intro Series: Coffee Lectures \& Espresso Shots}},