Slides for fifth session
This commit is contained in:
parent
66c9711d45
commit
7a2327aba3
503
05_clean_coding/05_clean_coding.tex
Normal file
503
05_clean_coding/05_clean_coding.tex
Normal file
@ -0,0 +1,503 @@
|
||||
\documentclass[aspectratio=169]{beamer}
|
||||
|
||||
\usepackage{listings}
|
||||
%\usepackage[utf8]{inputenc}
|
||||
\usepackage[style = apa, backend = biber, natbib = true]{biblatex}
|
||||
\addbibresource{../literature/lit.bib}
|
||||
|
||||
\usepackage{fancyvrb}
|
||||
\usepackage{fontawesome5} % get icons
|
||||
\usepackage{multirow}
|
||||
\usepackage{color, colortbl}
|
||||
|
||||
\usepackage{tikz}
|
||||
\usetikzlibrary{fit}
|
||||
\usepackage[edges]{forest}
|
||||
|
||||
\lstset{language = R,%
|
||||
basicstyle = \ttfamily\color{iwmgray},
|
||||
frame = single,
|
||||
rulecolor = \color{iwmgray},
|
||||
commentstyle = \slshape\color{iwmgreen},
|
||||
keywordstyle = \bfseries\color{iwmgray},
|
||||
identifierstyle = \color{iwmpurple},
|
||||
stringstyle = \color{iwmblue},
|
||||
numbers = none,%left,numberstyle = \tiny,
|
||||
basewidth = {.5em, .4em},
|
||||
showstringspaces = false,
|
||||
emphstyle = \color{red!50!white}}
|
||||
|
||||
\makeatletter \def\newblock{\beamer@newblock} \makeatother
|
||||
|
||||
\beamertemplatenavigationsymbolsempty
|
||||
\setbeamertemplate{itemize items}[circle]
|
||||
\setbeamertemplate{section in toc}[circle]
|
||||
\mode<beamer>{\setbeamercolor{math text displayed}{fg=iwmgray}}
|
||||
\setbeamercolor{block body}{bg=iwmorange!50!white}
|
||||
\setbeamercolor{block title}{fg=white, bg=iwmorange}
|
||||
% Definitions for biblatex
|
||||
\setbeamercolor{bibliography entry note}{fg=iwmgray}
|
||||
\setbeamercolor{bibliography entry author}{fg=iwmgray}
|
||||
\setbeamertemplate{bibliography item}{}
|
||||
|
||||
\definecolor{iwmorange}{RGB}{255,105,0}
|
||||
\definecolor{iwmgray}{RGB}{67,79,79}
|
||||
\definecolor{iwmblue}{RGB}{60,180,220}
|
||||
\definecolor{iwmgreen}{RGB}{145,200,110}
|
||||
\definecolor{iwmpurple}{RGB}{120,0,75}
|
||||
|
||||
\setbeamercolor{title}{fg=iwmorange}
|
||||
\setbeamercolor{frametitle}{fg=iwmorange}
|
||||
\setbeamercolor{structure}{fg=iwmorange}
|
||||
\setbeamercolor{normal text}{fg=iwmgray}
|
||||
\setbeamercolor{author}{fg=iwmgray}
|
||||
\setbeamercolor{date}{fg=iwmgray}
|
||||
|
||||
\newcommand{\vect}[1]{\mathbf{#1}}
|
||||
\newcommand{\mat}[1]{\mathbf{#1}}
|
||||
\newcommand{\gvect}[1]{\boldsymbol{#1}}
|
||||
\newcommand{\gmat}[1]{\boldsymbol{#1}}
|
||||
|
||||
\AtBeginSection[]{
|
||||
\frame{
|
||||
\tableofcontents[sectionstyle=show/hide, subsectionstyle=show/show/hide]}}
|
||||
|
||||
\setbeamertemplate{headline}{
|
||||
\begin{beamercolorbox}{section in head}
|
||||
\vskip5pt\insertsectionnavigationhorizontal{\paperwidth}{}{}\vskip2pt
|
||||
\end{beamercolorbox}
|
||||
}
|
||||
|
||||
\setbeamertemplate{footline}{\vskip-2pt\hfill\insertframenumber$\;$\vskip2pt}
|
||||
|
||||
\title{Clean coding}
|
||||
\author{Nora Wickelmaier}
|
||||
\date{July 8, 2024}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\begin{frame}{}
|
||||
\thispagestyle{empty}
|
||||
\titlepage
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{What is needed to make code reproducible?}
|
||||
% slido
|
||||
\centering
|
||||
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 5}
|
||||
|
||||
\url{https://app.sli.do/event/uEz8fJWkLBNm1sthQovXNH}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile]{Programming resources}
|
||||
\footnotesize
|
||||
\begin{tabular}{ll}
|
||||
Learning statistics with R & {\url{https://learningstatisticswithr.com/book/}} \\
|
||||
&\\
|
||||
R for Data Science & {\url{https://r4ds.hadley.nz/}} \\
|
||||
&\\
|
||||
Advanced R & {\url{https://adv-r.hadley.nz/}} \\
|
||||
&\\
|
||||
Happy Git and GitHub for the useR & {\url{https://happygitwithr.com/}} \\
|
||||
&\\
|
||||
R Programming for Research & {\url{https://geanders.github.io/RProgrammingForResearch/}} \\
|
||||
&\\
|
||||
Building reproducible analytical pipelines with R & {\url{https://raps-with-r.dev/}} \\
|
||||
&\\
|
||||
Data Skills for Reproducible Science & {\url{https://psyteachr.github.io/msc-data-skills/}} \\
|
||||
\end{tabular}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Agenda}
|
||||
\centering
|
||||
\begin{tabular}{ll}
|
||||
\hline
|
||||
Date & Topic \\
|
||||
\hline
|
||||
2024-05-13 & Introduction to data management \\
|
||||
2024-05-27 & Workflow \\
|
||||
2024-06-10 & Data organisation\\
|
||||
2024-06-24 & Data sharing \\
|
||||
\only<1>{2024-07-08}\only<2>{\bf 2024-07-08} &
|
||||
\only<1>{Clean coding}\only<2>{\bf Clean coding} \\
|
||||
2024-07-22 & Version control \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{frame}
|
||||
|
||||
% Understandable coding
|
||||
% Cleaning up R code for readability
|
||||
% Documentation of a final R script
|
||||
% Reproducible code
|
||||
|
||||
\section{Style guidelines}
|
||||
|
||||
\begin{frame}[<+->]{Style guidelines in R}
|
||||
\begin{itemize}
|
||||
\item R has no mandatory or commonly accepted style guide
|
||||
\item However, Hadley Wickham and Google developed style guides which are
|
||||
now widely accepted
|
||||
\begin{itemize}
|
||||
\item \url{https://google.github.io/styleguide/Rguide.html}
|
||||
\item \url{https://style.tidyverse.org/}
|
||||
\end{itemize}
|
||||
\item It is always a good idea to follow a style guide and not ``create''
|
||||
your own rules (if you deviate, be consistent!)
|
||||
\item A style guide helps with
|
||||
\begin{itemize}
|
||||
\item Keeping code clean, which makes it easier to read and interpret
|
||||
\item Making it easier to catch and fix mistakes
|
||||
\item Making it easier for others to follow and adapt your code
|
||||
\item Preventing possible problems, e.\,g., avoiding dots in function
|
||||
names
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\nocite{Wickham_styleguide, Anderson2023}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile, allowframebreaks]{File names}
|
||||
\begin{itemize}
|
||||
\item File names should be meaningful and end in .R
|
||||
\item Avoid using special characters in file names
|
||||
\item Stick with numbers, letters, \verb+-+, and \verb+_+
|
||||
\begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}]
|
||||
# Good
|
||||
fit_models.R
|
||||
utility_functions.R
|
||||
|
||||
# Bad
|
||||
fit models.R
|
||||
foo.r
|
||||
stuff.r
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item If files should be run in a particular order, prefix them with numbers
|
||||
\item If it seems likely you’ll have more than 10 files, left pad with zero
|
||||
\begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}]
|
||||
00_download.R
|
||||
01_explore.R
|
||||
...
|
||||
09_model.R
|
||||
10_visualize.R
|
||||
\end{lstlisting}
|
||||
\item If you later realize that you missed some steps, it’s tempting to use
|
||||
02a, 02b, etc.
|
||||
\item However, it is generally better to bite the bullet and rename all
|
||||
files
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile, allowframebreaks]{Object names}
|
||||
\begin{itemize}
|
||||
\item Variable and function names should use only lowercase letters,
|
||||
numbers, and \verb+_+
|
||||
\item Use underscores (\verb+_+) (so-called snake case) to separate words
|
||||
within a name
|
||||
\begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}]
|
||||
# Good
|
||||
day_one
|
||||
day_1
|
||||
|
||||
# Bad
|
||||
DayOne
|
||||
dayone
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Generally, variable names should be nouns and function names should be
|
||||
verbs
|
||||
\item Strive for names that are concise and meaningful
|
||||
\begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}]
|
||||
# Good
|
||||
day_one
|
||||
|
||||
# Bad
|
||||
first_day_of_the_month
|
||||
djm1
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Avoid re-using names of common functions and variables
|
||||
\begin{lstlisting}
|
||||
# Bad
|
||||
T <- FALSE
|
||||
c <- 10
|
||||
mean <- function(x) sum(x)
|
||||
\end{lstlisting}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile, allowframebreaks]{Spacing}
|
||||
\begin{itemize}
|
||||
\item Always put a space after a comma, never before
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
x[, 1]
|
||||
|
||||
# Bad
|
||||
x[,1]
|
||||
x[ ,1]
|
||||
x[ , 1]
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Do not put spaces inside or outside parentheses for regular function
|
||||
calls
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
mean(x, na.rm = TRUE)
|
||||
|
||||
# Bad
|
||||
mean (x, na.rm = TRUE)
|
||||
mean( x, na.rm = TRUE )
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Place a space before and after \texttt{()} when used with \texttt{if},
|
||||
\texttt{for}, or \texttt{while}
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
if (debug) {
|
||||
show(x)
|
||||
}
|
||||
|
||||
# Bad
|
||||
if(debug){
|
||||
show(x)
|
||||
}
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Place a space after \texttt{()} used for function arguments
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
function(x) {}
|
||||
|
||||
# Bad
|
||||
function (x) {}
|
||||
function(x){}
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Most infix operators (\verb+==+, \verb|+|, \verb+-+, \verb+<-+, etc.)
|
||||
should always be surrounded by spaces
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
height <- (feet * 12) + inches
|
||||
mean(x, na.rm = TRUE)
|
||||
|
||||
# Bad
|
||||
height<-feet*12+inches
|
||||
mean(x, na.rm=TRUE)
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item There are a few exceptions, which should never be surrounded by
|
||||
spaces: \verb+::+, \verb+:::+, \verb+$+, \verb+@+, \verb+[+, \verb+[[+,
|
||||
\verb+?+, \verb+^+, and \verb+:+
|
||||
{\small
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
sqrt(x^2 + y^2)
|
||||
df$z
|
||||
x <- 1:10
|
||||
package?stats
|
||||
?mean
|
||||
|
||||
# Bad
|
||||
sqrt(x ^ 2 + y ^ 2)
|
||||
df $ z
|
||||
x <- 1 : 10
|
||||
package ? stats
|
||||
? mean
|
||||
\end{lstlisting}
|
||||
}
|
||||
\item Adding extra spaces is ok if it improves alignment of \verb+=+ or
|
||||
\verb+<-+
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
list(
|
||||
total = a + b + c,
|
||||
mean = (a + b + c) / n
|
||||
)
|
||||
|
||||
# Also fine
|
||||
list(
|
||||
total = a + b + c,
|
||||
mean = (a + b + c) / n
|
||||
)
|
||||
\end{lstlisting}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
% CITE:
|
||||
% https://style.tidyverse.org/index.html
|
||||
% R Programming for Research: https://geanders.github.io/RProgrammingForResearch/
|
||||
% Building reproducible analytical pipelines with R: https://raps-with-r.dev/
|
||||
|
||||
\section{Script organisation}
|
||||
|
||||
\begin{frame}[fragile]{Script header}
|
||||
\begin{itemize}
|
||||
\item It can be very helpful to have some general information right at the
|
||||
top when opening a script
|
||||
\begin{lstlisting}
|
||||
# 01_preprocessing.R
|
||||
#
|
||||
# Cleaning up toy data set (Methods Seminar SS2024)
|
||||
#
|
||||
# Input: rawdata/RDM_MS_SS2024_download_2024-06-07.csv
|
||||
# Output: processed/data_rdm-ms-ss2024_cleaned.csv
|
||||
# processed/data_rdm-ms-ss2024_cleaned.RData
|
||||
#
|
||||
# Created: 2024-06-03, NW
|
||||
\end{lstlisting}
|
||||
\item These metadata help you remember faster what you did
|
||||
\item Might not be necessary when using consistent version control (but does
|
||||
not hurt either)
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile]{Line length}
|
||||
{}
|
||||
\begin{center}
|
||||
{\Large\bf Keep lines to 80 characters or less!}
|
||||
\end{center}
|
||||
\begin{lstlisting}
|
||||
# Good
|
||||
my_df <- data.frame(n = 1:3,
|
||||
letter = c("a", "b", "c"),
|
||||
cap_letter = c("A", "B", "C"))
|
||||
|
||||
# Bad
|
||||
my_df <- data.frame(n = 1:3, letter = c("a", "b", "c"), cap_letter = c("A", "B", "C"))
|
||||
\end{lstlisting}
|
||||
\begin{itemize}
|
||||
\item Ensures that your code is formatted in a way that you can see all of
|
||||
the code without scrolling horizontally
|
||||
\item To set your script pane to be limited to 80 characters, go to\\
|
||||
\verb+RStudio -> Preferences -> Code -> Display+\\
|
||||
and set ``Margin Column'' to 80
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile, allowframebreaks]{File organisation}
|
||||
\begin{itemize}
|
||||
\item Try to write scripts that are concerned with one (major) task
|
||||
\item If you can find a name that captures the content, it is usually a
|
||||
good way to start
|
||||
\item Some (random) examples
|
||||
\begin{lstlisting}[identifierstyle = \bfseries\color{iwmgray}]
|
||||
download-data.R
|
||||
data-cleaning.R
|
||||
cluster_analysis_exp1.R
|
||||
visualization_logistic-model.R
|
||||
anova_h1.R
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Export data sets for new scripts (do not make yourself run all scripts
|
||||
up to script 5 each time, just because you need the data in a certain
|
||||
format)
|
||||
\begin{lstlisting}
|
||||
# Interoperable
|
||||
write.table(dat,
|
||||
file = "data_exp1_cleaned.csv",
|
||||
sep = ";",
|
||||
quote = FALSE,
|
||||
row.names = FALSE)
|
||||
|
||||
# Preserve order of factor levels, date formats, etc.
|
||||
save(dat, file = "data_exp1_cleaned.RData")
|
||||
\end{lstlisting}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile, allowframebreaks]{Internal structure}
|
||||
\begin{itemize}
|
||||
\item Use commented lines with \texttt{-} or \texttt{=} to break your file
|
||||
up into chunks
|
||||
\item Load additional packages at the beginning of the script
|
||||
\begin{lstlisting}
|
||||
library(lme4)
|
||||
library(sjPlot)
|
||||
|
||||
# Load data ---------------------------
|
||||
|
||||
# Plot data ---------------------------
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item If you load several packages, be aware that the order of loading
|
||||
matters!
|
||||
\item If you use only one or two functions from a package, get the function
|
||||
with \verb+::+ instead of loading the whole package
|
||||
\begin{lstlisting}
|
||||
library(lme4)
|
||||
...
|
||||
|
||||
# Fit mixed-effects model to test Hypothesis 1
|
||||
lme1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
|
||||
summary(lme1)
|
||||
sjPlot::tab_model(lme1)
|
||||
\end{lstlisting}
|
||||
\framebreak
|
||||
|
||||
\item Group related pieces of code together
|
||||
\item Separate blocks of code by empty spaces
|
||||
\begin{lstlisting}
|
||||
# Load data
|
||||
library(faraway)
|
||||
data(nepali)
|
||||
|
||||
# Relabel sex variable
|
||||
nepali$sex <- factor(nepali$sex,
|
||||
levels = c(1, 2),
|
||||
labels = c("Male", "Female"))
|
||||
\end{lstlisting}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
||||
|
||||
\begin{frame}{How can I test if my code is reproducible?}
|
||||
% slido
|
||||
\centering
|
||||
\includegraphics[width = 5cm]{../figures/QR Code for Methodenseminar SS 2024 - Session 5}
|
||||
|
||||
\url{https://app.sli.do/event/uEz8fJWkLBNm1sthQovXNH}
|
||||
\end{frame}
|
||||
|
||||
\section{Code reviews}
|
||||
|
||||
\begin{frame}[<+->]{Use your peers}
|
||||
\begin{itemize}
|
||||
\item Do not overthink it!
|
||||
\item Just give your data and code to a colleague and ask them to reproduce
|
||||
what you did (this sounds easy, but it is actually not!)
|
||||
\item This will give you tons of insights about your workflow
|
||||
\begin{itemize}
|
||||
\item Can this person (in general) understand what you did?
|
||||
\item Is this person able to easily put your data on their machine and
|
||||
run the code right away?
|
||||
\item Anything this person would have done differently?
|
||||
\item Discuss why and which things you do differently
|
||||
\end{itemize}
|
||||
\item Reading other people's code is the best way to learn about how things
|
||||
can be done differently than you do them
|
||||
\item You can review code by printing it out and adding comments by hand\\
|
||||
(I highly recommend this!)
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\appendix
|
||||
%\begin{frame}[allowframebreaks]{References}
|
||||
\begin{frame}{References}
|
||||
%\renewcommand{\bibfont}{\small}
|
||||
\printbibliography
|
||||
\vfill
|
||||
\end{frame}
|
||||
|
||||
\end{document}
|
||||
|
BIN
figures/QR Code for Methodenseminar SS 2024 - Session 5.png
Normal file
BIN
figures/QR Code for Methodenseminar SS 2024 - Session 5.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 5.7 KiB |
@ -1,3 +1,11 @@
|
||||
@book{Anderson2023,
|
||||
title = {R programming for research},
|
||||
author = {Brooke Anderson and Rachel Severson and Nicholas Good},
|
||||
year = {2023},
|
||||
publisher = {Colorado State University, ERHS 535},
|
||||
url = {https://geanders.github.io/RProgrammingForResearch/}
|
||||
}
|
||||
|
||||
@article{Kathawalla2021,
|
||||
title = {Easing into open science: {A} guide for graduate students and their advisors},
|
||||
author = {Kathawalla, Ummul-Kiram and Silverstein, Priya and Syed, Moin},
|
||||
@ -61,6 +69,12 @@
|
||||
doi = {10.1016/j.intell.2012.01.004}
|
||||
}
|
||||
|
||||
@misc{Wickham_styleguide,
|
||||
author = {Hadley Wickham},
|
||||
title = {The tidyverse style guide},
|
||||
url = {https://style.tidyverse.org/}
|
||||
}
|
||||
|
||||
@misc{Wilbrandt2023,
|
||||
author = {Wilbrandt, Jeanne},
|
||||
title = {{Research Data Management Intro Series: Coffee Lectures \& Espresso Shots}},
|
||||
|
Loading…
Reference in New Issue
Block a user