diff --git a/05_clean_coding/05_clean_coding.tex b/05_clean_coding/05_clean_coding.tex new file mode 100644 index 0000000..c4e585e --- /dev/null +++ b/05_clean_coding/05_clean_coding.tex @@ -0,0 +1,503 @@ +\documentclass[aspectratio=169]{beamer} + +\usepackage{listings} +%\usepackage[utf8]{inputenc} +\usepackage[style = apa, backend = biber, natbib = true]{biblatex} +\addbibresource{../literature/lit.bib} + +\usepackage{fancyvrb} +\usepackage{fontawesome5} % get icons +\usepackage{multirow} +\usepackage{color, colortbl} + +\usepackage{tikz} +\usetikzlibrary{fit} +\usepackage[edges]{forest} + +\lstset{language = R,% + basicstyle = \ttfamily\color{iwmgray}, + frame = single, + rulecolor = \color{iwmgray}, + commentstyle = \slshape\color{iwmgreen}, + keywordstyle = \bfseries\color{iwmgray}, + identifierstyle = \color{iwmpurple}, + stringstyle = \color{iwmblue}, + numbers = none,%left,numberstyle = \tiny, + basewidth = {.5em, .4em}, + showstringspaces = false, + emphstyle = \color{red!50!white}} + +\makeatletter \def\newblock{\beamer@newblock} \makeatother + +\beamertemplatenavigationsymbolsempty +\setbeamertemplate{itemize items}[circle] +\setbeamertemplate{section in toc}[circle] +\mode{\setbeamercolor{math text displayed}{fg=iwmgray}} +\setbeamercolor{block body}{bg=iwmorange!50!white} +\setbeamercolor{block title}{fg=white, bg=iwmorange} +% Definitions for biblatex +\setbeamercolor{bibliography entry note}{fg=iwmgray} +\setbeamercolor{bibliography entry author}{fg=iwmgray} +\setbeamertemplate{bibliography item}{} + +\definecolor{iwmorange}{RGB}{255,105,0} +\definecolor{iwmgray}{RGB}{67,79,79} +\definecolor{iwmblue}{RGB}{60,180,220} +\definecolor{iwmgreen}{RGB}{145,200,110} +\definecolor{iwmpurple}{RGB}{120,0,75} + +\setbeamercolor{title}{fg=iwmorange} +\setbeamercolor{frametitle}{fg=iwmorange} +\setbeamercolor{structure}{fg=iwmorange} +\setbeamercolor{normal text}{fg=iwmgray} +\setbeamercolor{author}{fg=iwmgray} +\setbeamercolor{date}{fg=iwmgray} + 
+\newcommand{\vect}[1]{\mathbf{#1}} +\newcommand{\mat}[1]{\mathbf{#1}} +\newcommand{\gvect}[1]{\boldsymbol{#1}} +\newcommand{\gmat}[1]{\boldsymbol{#1}} + +\AtBeginSection[]{ + \frame{ + \tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}} + +\setbeamertemplate{headline}{ + \begin{beamercolorbox}{section in head} + \vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt + \end{beamercolorbox} +} + +\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt} + +\title{Clean coding} +\author{Nora Wickelmaier} +\date{July 8, 2024} + +\begin{document} + +\begin{frame}{} +\thispagestyle{empty} +\titlepage +\end{frame} + +\begin{frame}{What is needed to make code reproducible?} + % slido + \centering + \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 5} + + \url{https://app.sli.do/event/uEz8fJWkLBNm1sthQovXNH} +\end{frame} + +\begin{frame}[fragile]{Programming resources} + \footnotesize + \begin{tabular}{ll} + Learning statistics with R & {\url{https://learningstatisticswithr.com/book/}} \\ + &\\ + R for Data Science & {\url{https://r4ds.hadley.nz/}} \\ + &\\ + Advanced R & {\url{https://adv-r.hadley.nz/}} \\ + &\\ + Happy Git and GitHub for the useR & {\url{https://happygitwithr.com/}} \\ + &\\ + R Programming for Research & {\url{https://geanders.github.io/RProgrammingForResearch/}} \\ + &\\ + Building reproducible analytical pipelines with R & {\url{https://raps-with-r.dev/}} \\ + &\\ + Data Skills for Reproducible Science & {\url{https://psyteachr.github.io/msc-data-skills/}} \\ + \end{tabular} +\end{frame} + +\begin{frame}{Agenda} +\centering +\begin{tabular}{ll} +\hline +Date & Topic \\ +\hline +2024-05-13 & Introduction to data management \\ +2024-05-27 & Workflow \\ +2024-06-10 & Data organisation\\ +2024-06-24 & Data sharing \\ +\only<1>{2024-07-08}\only<2>{\bf 2024-07-08} & +\only<1>{Clean coding}\only<2>{\bf Clean coding} \\ +2024-07-22 & Version control \\ +\hline 
+\end{tabular} +\end{frame} + +% Understandable coding +% Cleaning up R code for readability +% Documentation of a final R script +% Reproducible code + +\section{Style guidelines} + +\begin{frame}[<+->]{Style guidelines in R} + \begin{itemize} + \item R has no mandatory or commonly accepted style guide + \item However, Hadley Wickham and Google developed style guides which are + now widely accepted + \begin{itemize} + \item \url{https://google.github.io/styleguide/Rguide.html} + \item \url{https://style.tidyverse.org/} + \end{itemize} + \item It is always a good idea to follow a style guide and not ``create'' + your own rules (if you deviate, be consistent!) + \item A style guide helps with + \begin{itemize} + \item Keeping code clean which is easier to read and interpret + \item Making it easier to catch and fix mistakes + \item Making it easier for others to follow and adapt your code + \item Preventing possible problems, e.\,g., avoiding dots in function + names + \end{itemize} + \end{itemize} + \nocite{Wickham_styleguide, Anderson2023} +\end{frame} + +\begin{frame}[fragile, allowframebreaks]{File names} + \begin{itemize} + \item File names should be meaningful and end in .R + \item Avoid using special characters in file names + \item Stick with numbers, letters, \verb+-+, and \verb+_+ + \begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}] +# Good +fit_models.R +utility_functions.R + +# Bad +fit models.R +foo.r +stuff.r + \end{lstlisting} + \framebreak + + \item If files should be run in a particular order, prefix them with numbers + \item If it seems likely you’ll have more than 10 files, left pad with zero + \begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}] +00_download.R +01_explore.R +... +09_model.R +10_visualize.R + \end{lstlisting} + \item If you later realize that you missed some steps, it’s tempting to use + 02a, 02b, etc. 
+ \item However, it is generally better to bite the bullet and rename all + files + \end{itemize} +\end{frame} + +\begin{frame}[fragile, allowframebreaks]{Object names} + \begin{itemize} + \item Variable and function names should use only lowercase letters, + numbers, and \verb+_+ + \item Use underscores (\verb+_+) (so-called snake case) to separate words + within a name + \begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}] +# Good +day_one +day_1 + +# Bad +DayOne +dayone + \end{lstlisting} + \framebreak + + \item Generally, variable names should be nouns and function names should be + verbs + \item Strive for names that are concise and meaningful + \begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}] +# Good +day_one + +# Bad +first_day_of_the_month +djm1 + \end{lstlisting} + \framebreak + + \item Avoid re-using names of common functions and variables + \begin{lstlisting} +# Bad +T <- FALSE +c <- 10 +mean <- function(x) sum(x) + \end{lstlisting} + \end{itemize} +\end{frame} + +\begin{frame}[fragile, allowframebreaks]{Spacing} + \begin{itemize} + \item Always put a space after a comma, never before + \begin{lstlisting} +# Good +x[, 1] + +# Bad +x[,1] +x[ ,1] +x[ , 1] + \end{lstlisting} + \framebreak + + \item Do not put spaces inside or outside parentheses for regular function + calls + \begin{lstlisting} +# Good +mean(x, na.rm = TRUE) + +# Bad +mean (x, na.rm = TRUE) +mean( x, na.rm = TRUE ) + \end{lstlisting} + \framebreak + +\item Place a space before and after \texttt{()} when used with \texttt{if}, + \texttt{for}, or \texttt{while} + \begin{lstlisting} +# Good +if (debug) { + show(x) +} + +# Bad +if(debug){ + show(x) +} + \end{lstlisting} + \framebreak + +\item Place a space after \texttt{()} used for function arguments + \begin{lstlisting} +# Good +function(x) {} + +# Bad +function (x) {} +function(x){} + \end{lstlisting} + \framebreak + + \item Most infix operators (\verb+==+, \verb|+|, \verb+-+, \verb+<-+, etc.) 
+ should always be surrounded by spaces + \begin{lstlisting} +# Good +height <- (feet * 12) + inches +mean(x, na.rm = TRUE) + +# Bad +height<-feet*12+inches +mean(x, na.rm=TRUE) + \end{lstlisting} + \framebreak + + \item There are a few exceptions, which should never be surrounded by + spaces: \verb+::+, \verb+:::+, \verb+$+, \verb+@+, \verb+[+, \verb+[[+, + \verb+?+, \verb+^+, and \verb+:+ + {\small + \begin{lstlisting} +# Good +sqrt(x^2 + y^2) +df$z +x <- 1:10 +package?stats +?mean + +# Bad +sqrt(x ^ 2 + y ^ 2) +df $ z +x <- 1 : 10 +package ? stats +? mean + \end{lstlisting} + } + \item Adding extra spaces is ok if it improves alignment of \verb+=+ or + \verb+<-+ + \begin{lstlisting} +# Good +list( + total = a + b + c, + mean = (a + b + c) / n +) + +# Also fine +list( + total = a + b + c, + mean = (a + b + c) / n +) + \end{lstlisting} + \end{itemize} +\end{frame} + +% CITE: +% https://style.tidyverse.org/index.html +% R Programming for Research: https://geanders.github.io/RProgrammingForResearch/ +% Building reproducible analytical pipelines with R: https://raps-with-r.dev/ + +\section{Script organisation} + +\begin{frame}[fragile]{Script header} + \begin{itemize} + \item It can be very helpful to have some general information right at the + top when opening a script + \begin{lstlisting} +# 01_preprocessing.R +# +# Cleaning up toy data set (Methods Seminar SS2024) +# +# Input: rawdata/RDM_MS_SS2024_download_2024-06-07.csv +# Output: processed/data_rdm-ms-ss2024_cleaned.csv +# processed/data_rdm-ms-ss2024_cleaned.RData +# +# Created: 2024-06-03, NW + \end{lstlisting} + \item These metadata help you remember faster what you did + \item Might not be necessary when using consistent version control (but does + not hurt either) + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Line length} + {} + \begin{center} + {\Large\bf Keep lines to 80 characters or less!} + \end{center} + \begin{lstlisting} +# Good +my_df <- data.frame(n = 1:3, + letter = c("a", "b", "c"), + 
+ cap_letter = c("A", "B", "C")) + +# Bad +my_df <- data.frame(n = 1:3, letter = c("a", "b", "c"), cap_letter = c("A", "B", "C")) + \end{lstlisting} + \begin{itemize} + \item Ensures that your code is formatted in a way that you can see all of + the code without scrolling horizontally + \item To set your script pane to be limited to 80 characters, go to\\ + \verb+RStudio -> Preferences -> Code -> Display+\\ + and set ``Margin Column'' to 80 + \end{itemize} +\end{frame} + +\begin{frame}[fragile, allowframebreaks]{File organisation} + \begin{itemize} + \item Try to write scripts that are concerned with one (major) task + \item If you can find a name that captures the content, it is usually a + good way to start + \item Some (random) examples +\begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}] +download-data.R +data-cleaning.R +cluster_analysis_exp1.R +visualization_logistic-model.R +anova_h1.R + \end{lstlisting} + \framebreak + + \item Export data sets for new scripts (do not make yourself run all scripts + up to script 5 each time, just because you need the data in a certain + format) + \begin{lstlisting} +# Interoperable +write.table(dat, + file = "data_exp1_cleaned.csv", + sep = ";", + quote = FALSE, + row.names = FALSE) + +# Preserve order of factor levels, date formats, etc. +save(dat, file = "data_exp1_cleaned.RData") + \end{lstlisting} + \end{itemize} +\end{frame} + +\begin{frame}[fragile, allowframebreaks]{Internal structure} + \begin{itemize} + \item Use commented lines with \texttt{-} or \texttt{=} to break your file + up into chunks + \item Load additional packages at the beginning of the script + \begin{lstlisting} +library(lme4) +library(sjPlot) + +# Load data --------------------------- + +# Plot data --------------------------- + \end{lstlisting} + \framebreak + + \item If you load several packages, be aware that the order of loading + matters! 
+ \item If you use only one or two functions from a package, get the function + with \verb+::+ instead of loading the whole package + \begin{lstlisting} +library(lme4) +... + +# Fit mixed-effects model to test Hypothesis 1 +lme1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy) +summary(lme1) +sjPlot::tab_model(lme1) + \end{lstlisting} + \framebreak + + \item Group related pieces of code together + \item Separate blocks of code by empty lines + \begin{lstlisting} +# Load data +library(faraway) +data(nepali) + +# Relabel sex variable +nepali$sex <- factor(nepali$sex, + levels = c(1, 2), + labels = c("Male", "Female")) + \end{lstlisting} + \end{itemize} +\end{frame} + + + +\begin{frame}{How can I test if my code is reproducible?} + % slido + \centering + \includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 5} + + \url{https://app.sli.do/event/uEz8fJWkLBNm1sthQovXNH} +\end{frame} + +\section{Code reviews} + +\begin{frame}[<+->]{Use your peers} + \begin{itemize} + \item Do not overthink it! + \item Just give your data and code to a colleague and ask them to reproduce + what you did (this sounds easy, but it is actually not!) + \item This will give you tons of insights about your workflow + \begin{itemize} + \item Can this person (in general) understand what you did? + \item Is this person able to easily put your data on their machine and + run the code right away? + \item Anything this person would have done differently? + \item Discuss why and which things you do differently + \end{itemize} + \item Reading other people's code is the best way to learn about how things + can be done differently than you do them + \item You can review code by printing it out and adding comments by hand\\ + (I highly recommend this!) 
+ \end{itemize} +\end{frame} + +\appendix +%\begin{frame}[allowframebreaks]{References} +\begin{frame}{References} +%\renewcommand{\bibfont}{\small} + \printbibliography +\vfill +\end{frame} + +\end{document} + diff --git a/figures/QR Code for Methodenseminar SS 2024 - Session 5.png b/figures/QR Code for Methodenseminar SS 2024 - Session 5.png new file mode 100644 index 0000000..0060507 Binary files /dev/null and b/figures/QR Code for Methodenseminar SS 2024 - Session 5.png differ diff --git a/literature/lit.bib b/literature/lit.bib index 557f5e0..7335aa3 100644 --- a/literature/lit.bib +++ b/literature/lit.bib @@ -1,3 +1,11 @@ +@book{Anderson2023, + title = {R programming for research}, + author = {Brooke Anderson and Rachel Severson and Nicholas Good}, + year = {2023}, + publisher = {Colorado State University, ERHS 535}, + url = {https://geanders.github.io/RProgrammingForResearch/} +} + @article{Kathawalla2021, title = {Easing into open science: {A} guide for graduate students and their advisors}, author = {Kathawalla, Ummul-Kiram and Silverstein, Priya and Syed, Moin}, @@ -9,11 +17,11 @@ } @book{Koeller2004, - title = {Wege zur {H}ochschulreife in {B}aden-{W}{\"u}rttemberg: {TOSCA} -- {E}ine {U}ntersuchung an allgemein bildenden und beruflichen {G}ymnasien}, - author = {K{\"o}ller, Olaf and Watermann, Ralf and Trautwein, Ulrich and L{\"u}dtke, Oliver}, - year = {2004}, + title = {Wege zur {H}ochschulreife in {B}aden-{W}{\"u}rttemberg: {TOSCA} -- {E}ine {U}ntersuchung an allgemein bildenden und beruflichen {G}ymnasien}, + author = {K{\"o}ller, Olaf and Watermann, Ralf and Trautwein, Ulrich and L{\"u}dtke, Oliver}, + year = {2004}, publisher = {Springer}, - doi = {10.1007/978-3-322-80906-3} + doi = {10.1007/978-3-322-80906-3} } @article{Lowndes2017, @@ -51,14 +59,20 @@ } @article{Wicherts2012, - title = {Publish (your data) or (let the data) perish! 
{W}hy not publish your data too?}, - author = {Wicherts, Jelte M and Bakker, Marjan}, - journal = {Intelligence}, - volume = {40}, - number = {2}, - pages = {73--76}, - year = {2012}, - doi = {10.1016/j.intell.2012.01.004} + title = {Publish (your data) or (let the data) perish! {W}hy not publish your data too?}, + author = {Wicherts, Jelte M and Bakker, Marjan}, + journal = {Intelligence}, + volume = {40}, + number = {2}, + pages = {73--76}, + year = {2012}, + doi = {10.1016/j.intell.2012.01.004} +} + +@misc{Wickham_styleguide, + author = {Hadley Wickham}, + title = {The tidyverse style guide}, + url = {https://style.tidyverse.org/} } @misc{Wilbrandt2023,