Dplyr beyond the basics

Julian Sagebiel & Nino Cavallaro

Introduction to dplyr

library(dplyr)
  • R package for data manipulation

  • Part of tidyverse

  • Provides a nicer workflow than base R

  • Provides useful functions such as mutate, summarize, group_by, filter, select etc.

  • Enables multiple data manipulations in one operation using pipes %>%

Introduction to purrr

library(purrr)
  • Also part of tidyverse

  • Package to work with functions and vectors

  • Allows to replace loops and make code more tidy and easier to read

  • Most prominent function: map

Example data set

  • We use panel data of a survey on urban green spaces in different German cities
rm(list=ls())

library(stringr)
library(dplyr)
library(tidylog)
library(purrr)


all_database <- readRDS("data/all_database.rds")

The data in detail

'data.frame':   45909 obs. of  69 variables:
 $ id                  : num  15169 15169 15169 15169 15169 ...
 $ choice              : num  1 1 1 1 1 1 1 2 1 3 ...
 $ DESIGN_ROW          : num  6 7 10 16 25 26 28 29 33 3 ...
 $ BAEUME.1            : num  9.03 5.03 5.03 7.03 7.03 ...
 $ BAEUME.2            : num  7.03 7.03 9.03 5.03 9.03 ...
 $ BAEUME.3            : num  5.03 5.03 5.03 5.03 5.03 ...
 $ GRUENFLAECHEN.1     : num  34 34 44 39 44 39 44 44 39 39 ...
 $ GRUENFLAECHEN.2     : num  39 39 34 44 34 34 34 34 44 34 ...
 $ GRUENFLAECHEN.3     : num  34 34 34 34 34 34 34 34 34 34 ...
 $ NATURNAH.1          : num  35 35 25 35 45 25 45 25 35 35 ...
 $ NATURNAH.2          : num  45 45 45 45 25 35 25 35 45 45 ...
 $ NATURNAH.3          : num  25 25 25 25 25 25 25 25 25 25 ...
 $ WEGE.1              : num  40 40 40 30 40 30 30 50 40 50 ...
 $ WEGE.2              : num  30 50 30 50 50 50 50 30 50 40 ...
 $ WEGE.3              : num  30 30 30 30 30 30 30 30 30 30 ...
 $ BEITRAG.1           : num  6 36 120 12 6 120 12 240 60 36 ...
 $ BEITRAG.2           : num  60 120 240 36 12 240 120 120 120 6 ...
 $ BEITRAG.3           : num  0 0 0 0 0 0 0 0 0 0 ...
 $ i_NUMBER            : num  15169 15169 15169 15169 15169 ...
  ..- attr(*, "label")= chr "Interviewnummer"
  ..- attr(*, "format.spss")= chr "F11.0"
  ..- attr(*, "display_width")= int 13
 $ i_TIME              : num  589 589 589 589 589 589 589 589 589 825 ...
  ..- attr(*, "label")= chr "Interviewdauer [sec]"
  ..- attr(*, "format.spss")= chr "F11.0"
  ..- attr(*, "display_width")= int 13
 $ Gender              : num  1 1 1 1 1 1 1 1 1 2 ...
  ..- attr(*, "label")= chr "Geschlecht"
  ..- attr(*, "format.spss")= chr "F1.0"
  ..- attr(*, "display_width")= int 4
  ..- attr(*, "labels")= Named num [1:4] 1 2 3 4
  .. ..- attr(*, "names")= chr [1:4] "Männlich" "Weiblich" "Divers" "keine Angabe"
 $ Education           : num  7 7 7 7 7 7 7 7 7 6 ...
  ..- attr(*, "label")= chr "höchster Bildungsabschluss"
  ..- attr(*, "format.spss")= chr "F1.0"
  ..- attr(*, "display_width")= int 4
  ..- attr(*, "labels")= Named num [1:8] 1 2 3 4 5 6 7 8
  .. ..- attr(*, "names")= chr [1:8] "Noch Schüler/in" "Schule beendet ohne Schulabschluss" "Volks-/Hauptschulabschluss" "Mittlere Reife, Realschulabschluss" ...
 $ Income              : num  1 1 1 1 1 1 1 1 1 2 ...
  ..- attr(*, "label")= chr "monatliches Haushaltsnettoeinkommen"
  ..- attr(*, "format.spss")= chr "F1.0"
  ..- attr(*, "display_width")= int 4
  ..- attr(*, "labels")= Named num [1:7] 1 2 3 4 5 6 7
  .. ..- attr(*, "names")= chr [1:7] "unter EUR 1.000" "EUR 1.000 bis unter EUR 2.000" "EUR 2.000 bis unter EUR 3.000" "EUR 3.000 bis unter EUR 4.000" ...
 $ Stadt               : num  23 23 23 23 23 23 23 23 23 23 ...
  ..- attr(*, "label")= chr "Wohnort"
  ..- attr(*, "format.spss")= chr "F2.0"
  ..- attr(*, "display_width")= int 7
  ..- attr(*, "labels")= Named num [1:23] 1 2 3 4 5 6 7 8 9 10 ...
  .. ..- attr(*, "names")= chr [1:23] "Berlin" "Hamburg" "München" "Köln" ...
 $ PLZ                 : num  86153 86153 86153 86153 86153 ...
  ..- attr(*, "label")= chr "Postleitzahl"
  ..- attr(*, "format.spss")= chr "F5.0"
  ..- attr(*, "display_width")= int 9
 $ estimated_Trees     : num  15 15 15 15 15 15 15 15 15 5 ...
  ..- attr(*, "label")= chr "Straßenbäume: geschätzte existierende Anzahl pro 100 Meter Straße"
  ..- attr(*, "format.spss")= chr "F12.1"
  ..- attr(*, "display_width")= int 14
 $ estimated_Greenspace: num  30 30 30 30 30 30 30 30 30 10 ...
  ..- attr(*, "label")= chr "Grünflächen: geschätzter existierender Anteil der Fläche in der näheren Wohnumgebung"
  ..- attr(*, "format.spss")= chr "F12.1"
  ..- attr(*, "display_width")= int 14
 $ estimated_Natural   : num  30 30 30 30 30 30 30 30 30 10 ...
  ..- attr(*, "label")= chr "naturnahe Flächen: geschätzter existierender Anteil an den Grünflächen in der näheren Wohnumgebung"
  ..- attr(*, "format.spss")= chr "F12.1"
  ..- attr(*, "display_width")= int 14
 $ Block               : num  3 3 3 3 3 3 3 3 3 4 ...
  ..- attr(*, "label")= chr "Anzuzeigender Block:"
  ..- attr(*, "format.spss")= chr "F1.0"
  ..- attr(*, "display_width")= int 7
  ..- attr(*, "labels")= Named num [1:4] 1 2 3 4
  .. ..- attr(*, "names")= chr [1:4] "Block 1" "Block 2" "Block 3" "Block 4"
 $ Balcony             : num  0 0 0 0 0 0 0 0 0 1 ...
  ..- attr(*, "label")= chr "Wohnausstattung: Balkon oder Terrasse"
  ..- attr(*, "format.spss")= chr "F1.0"
  ..- attr(*, "display_width")= int 10
  ..- attr(*, "labels")= Named num [1:2] 0 1
  .. ..- attr(*, "names")= chr [1:2] "Nicht ausgewählt" "ausgewählt"
 $ Garden              : num  1 1 1 1 1 1 1 1 1 0 ...
  ..- attr(*, "label")= chr "Wohnausstattung: Garten"
  ..- attr(*, "format.spss")= chr "F1.0"
  ..- attr(*, "display_width")= int 10
  ..- attr(*, "labels")= Named num [1:2] 0 1
  .. ..- attr(*, "names")= chr [1:2] "Nicht ausgewählt" "ausgewählt"
 $ Flat_size           : num  62 62 62 62 62 62 62 62 62 74 ...
  ..- attr(*, "label")= chr "Wohnfläche von Wohnung oder Haus (qm)"
  ..- attr(*, "format.spss")= chr "F12.3"
  ..- attr(*, "display_width")= int 14
  ..- attr(*, "labels")= Named num 9999
  .. ..- attr(*, "names")= chr "keine Angabe"
 $ HH_size             : num  1 1 1 1 1 1 1 1 1 3 ...
  ..- attr(*, "label")= chr "Anzahl von Personen im Haushalt"
  ..- attr(*, "format.spss")= chr "F2.0"
  ..- attr(*, "display_width")= int 6
  ..- attr(*, "labels")= Named num 99
  .. ..- attr(*, "names")= chr "keine Angabe"
 $ Num_Kids            : num  0 0 0 0 0 0 0 0 0 1 ...
  ..- attr(*, "label")= chr "Anzahl minderjähriger Personen im Haushalt"
  ..- attr(*, "format.spss")= chr "F1.0"
  ..- attr(*, "display_width")= int 6
  ..- attr(*, "labels")= Named num 99
  .. ..- attr(*, "names")= chr "keine Angabe"
 $ Age                 : num  59 59 59 59 59 59 59 59 59 69 ...
  ..- attr(*, "label")= chr "Geburtsjahr"
  ..- attr(*, "format.spss")= chr "F2.0"
  ..- attr(*, "display_width")= int 4
  ..- attr(*, "labels")= Named num [1:95] 1 2 3 4 5 6 7 8 9 10 ...
  .. ..- attr(*, "names")= chr [1:95] "2003" "2002" "2001" "2000" ...
 $ City                : chr  "Augsburg" "Augsburg" "Augsburg" "Augsburg" ...
 $ wohn_lon            : num  10.9 10.9 10.9 10.9 10.9 ...
 $ wohn_lat            : num  48.4 48.4 48.4 48.4 48.4 ...
 $ park_lon            : num  10.9 10.9 10.9 10.9 10.9 ...
 $ park_lat            : num  48.4 48.4 48.4 48.4 48.4 ...
 $ parkort             : num  1 1 1 1 1 1 1 1 1 1 ...
 $ wohnort             : num  1 1 1 1 1 1 1 1 1 1 ...
 $ SQ_Gruenflaechen    : num  34 34 34 34 34 34 34 34 34 34 ...
 $ SQ_Baeume           : num  5.03 5.03 5.03 5.03 5.03 ...
 $ SQ_naturnah         : num  25 25 25 25 25 25 25 25 25 25 ...
 $ SQ_Wege             : num  30 30 30 30 30 30 30 30 30 30 ...
 $ Population          : num  299021 299021 299021 299021 299021 ...
 $ Pop_density         : num  18.4 18.4 18.4 18.4 18.4 ...
 $ mean_HHsize         : num  2.01 2.01 2.01 2.01 2.01 2.01 2.01 2.01 2.01 2.01 ...
 $ mean_Age            : num  44.3 44.3 44.3 44.3 44.3 ...
 $ mean_Income         : num  1600 1600 1600 1600 1600 ...
 $ Hot_days            : num  8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 ...
 $ Ost_West            : num  0 0 0 0 0 0 0 0 0 0 ...
 $ mean_Einkommen_level: num  1.6 1.6 1.6 1.6 1.6 ...
 $ Ref_Baeume          : num  4 4 4 4 4 4 4 4 4 4 ...
 $ Ref_Gruenflaechen   : num  27 27 27 27 27 27 27 27 27 27 ...
 $ Ref_naturnah        : num  18 18 18 18 18 18 18 18 18 18 ...
 $ Ref_Wege            : num  41 41 41 41 41 41 41 41 41 41 ...
 $ Ref_Einkommen       : num  1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 ...
 $ Ref_Alter           : num  45 45 45 45 45 45 45 45 45 45 ...
 $ Ref_HHGroesse       : num  2 2 2 2 2 2 2 2 2 2 ...
 $ Ref_Hitzetage       : num  11 11 11 11 11 11 11 11 11 11 ...
 $ Ref_Einwohner       : num  750000 750000 750000 750000 750000 750000 750000 750000 750000 750000 ...
 $ Ref_Einwohnerdichte : num  23 23 23 23 23 23 23 23 23 23 ...
 $ Befragungsjahr      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Rekrutierung        : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Einwohner_Zelle     : num  0 0 0 0 0 0 0 0 0 88 ...
 $ GF_Anteil           : num  16.7 16.7 16.7 16.7 16.7 16.7 16.7 16.7 16.7 12.5 ...
 $ Uni_degree          : num  1 1 1 1 1 1 1 1 1 0 ...

Automatisation

  • Once we repeat code with different variables, datasets etc. we want to avoid repeating each step manually.

  • Good practice to use functions and loops to repeat code

General principle: Never use the same code twice.

Easy Example

  • We want to rename some German variables to English
[1] "SQ_Baeume"         "Ref_Baeume"        "SQ_Gruenflaechen" 
[4] "Ref_Gruenflaechen"
  • Lets rename them to English
easydata <- all_database %>% 
  rename(SQ_Trees = "SQ_Baeume",           Ref_Trees = "Ref_Baeume" ,
         SQ_Green =  "SQ_Gruenflaechen" , Ref_Green = "Ref_Gruenflaechen")
rename: renamed 4 variables (SQ_Green, SQ_Trees, Ref_Trees, Ref_Green)
  • Lots of typing
  • Prone to human error

Saver example

  • We can use rename_with to automate the process
savedata <- all_database %>% 
  rename_with(~ str_replace_all(.,c("Baeume" = "Trees", 
                                  "Gruenflaechen" = "Green")),
              everything()
              )
rename_with: renamed 4 variables (SQ_Green, SQ_Trees, Ref_Trees, Ref_Green)
  • Less typing and less chance of human error

Datasets are the same

  • Check if both dataframes are the same
identical(savedata,easydata)
[1] TRUE

Using mutate

  • Automatisation can also be easily done in mutate using across.

  • Some variables should be rounded

  BAEUME.1 BAEUME.2 BAEUME.3
1 9.032278 7.032278 5.032278
2 5.032278 7.032278 5.032278
3 5.032278 9.032278 5.032278
4 7.032278 5.032278 5.032278
5 7.032278 9.032278 5.032278
6 5.032278 7.032278 5.032278

Easy way

easydata <- easydata %>% 
  mutate(BAEUME.1 = as.integer(BAEUME.1),
         BAEUME.2 = as.integer(BAEUME.2),
         BAEUME.3 = as.integer(BAEUME.3))


easydata %>% select(starts_with("BAEUME")) %>% head()
  BAEUME.1 BAEUME.2 BAEUME.3
1        9        7        5
2        5        7        5
3        5        9        5
4        7        5        5
5        7        9        5
6        5        7        5

Save way

savedata <- savedata %>% 
  mutate(across(starts_with("BAEUME"), 
    as.integer))


savedata %>% select(starts_with("BAEUME")) %>% head()
  BAEUME.1 BAEUME.2 BAEUME.3
1        9        7        5
2        5        7        5
3        5        9        5
4        7        5        5
5        7        9        5
6        5        7        5
  • starts_with however is not always safe.

Use dplyr within functions

  • sometimes we want to repeat this step for several variables

  • We can still achieve that with starts_with or other pronouns

  • starts_with(c("BAEUME","GRUENFLAECHEN"))

  • At some point, we still want to automate.

  • Maybe we want to use this specific function to various datasets

Mutate in a function

mutate_to_int <- function(.data, variables){
  
 .data %>% 
      mutate(across(variables, 
    as.integer))
}


try(savedata %>% mutate_to_int(BAEUME.1))
Error in .fun(.data, ...) : 
  Problem while computing `..1 = across(variables, as.integer)`.
Caused by error in `across()`:
! object 'BAEUME.1' not found
try(savedata<-mutate_to_int("BAEUME.1"))
Error in UseMethod("mutate") : 
  no applicable method for 'mutate' applied to an object of class "character"
  • Would be intuitive, but will not work.

Correctly mutate in a function

mutate_to_int <- function(.data, variables){
  
 .data %>% 
      mutate(across({{ variables }}, 
    as.integer))
}

all_database %>% mutate_to_int("BAEUME.1") %>% head()

all_database %>% mutate_to_int(starts_with("BAEUME")) %>% head()

all_database %>% mutate_to_int(everything()) %>% head()
all_database %>% mutate_to_int("BAEUME.1") %>% head()
     id choice DESIGN_ROW BAEUME.1 BAEUME.2 BAEUME.3 GRUENFLAECHEN.1
1 15169      1          6        9 7.032278 5.032278              34
2 15169      1          7        5 7.032278 5.032278              34
3 15169      1         10        5 9.032278 5.032278              44
4 15169      1         16        7 5.032278 5.032278              39
5 15169      1         25        7 9.032278 5.032278              44
6 15169      1         26        5 7.032278 5.032278              39
  GRUENFLAECHEN.2 GRUENFLAECHEN.3 NATURNAH.1 NATURNAH.2 NATURNAH.3 WEGE.1
1              39              34         35         45         25     40
2              39              34         35         45         25     40
3              34              34         25         45         25     40
4              44              34         35         45         25     30
5              34              34         45         25         25     40
6              34              34         25         35         25     30
  WEGE.2 WEGE.3 BEITRAG.1 BEITRAG.2 BEITRAG.3 i_NUMBER i_TIME Gender Education
1     30     30         6        60         0    15169    589      1         7
2     50     30        36       120         0    15169    589      1         7
3     30     30       120       240         0    15169    589      1         7
4     50     30        12        36         0    15169    589      1         7
5     50     30         6        12         0    15169    589      1         7
6     50     30       120       240         0    15169    589      1         7
  Income Stadt   PLZ estimated_Trees estimated_Greenspace estimated_Natural
1      1    23 86153              15                   30                30
2      1    23 86153              15                   30                30
3      1    23 86153              15                   30                30
4      1    23 86153              15                   30                30
5      1    23 86153              15                   30                30
6      1    23 86153              15                   30                30
  Block Balcony Garden Flat_size HH_size Num_Kids Age     City wohn_lon
1     3       0      1        62       1        0  59 Augsburg 10.91613
2     3       0      1        62       1        0  59 Augsburg 10.91613
3     3       0      1        62       1        0  59 Augsburg 10.91613
4     3       0      1        62       1        0  59 Augsburg 10.91613
5     3       0      1        62       1        0  59 Augsburg 10.91613
6     3       0      1        62       1        0  59 Augsburg 10.91613
  wohn_lat park_lon park_lat parkort wohnort SQ_Gruenflaechen SQ_Baeume
1 48.36318 10.93323 48.35956       1       1               34  5.032278
2 48.36318 10.93323 48.35956       1       1               34  5.032278
3 48.36318 10.93323 48.35956       1       1               34  5.032278
4 48.36318 10.93323 48.35956       1       1               34  5.032278
5 48.36318 10.93323 48.35956       1       1               34  5.032278
6 48.36318 10.93323 48.35956       1       1               34  5.032278
  SQ_naturnah SQ_Wege Population Pop_density mean_HHsize mean_Age mean_Income
1          25      30     299021    18.40764        2.01    44.28     1600.25
2          25      30     299021    18.40764        2.01    44.28     1600.25
3          25      30     299021    18.40764        2.01    44.28     1600.25
4          25      30     299021    18.40764        2.01    44.28     1600.25
5          25      30     299021    18.40764        2.01    44.28     1600.25
6          25      30     299021    18.40764        2.01    44.28     1600.25
  Hot_days Ost_West mean_Einkommen_level Ref_Baeume Ref_Gruenflaechen
1      8.6        0              1.60025          4                27
2      8.6        0              1.60025          4                27
3      8.6        0              1.60025          4                27
4      8.6        0              1.60025          4                27
5      8.6        0              1.60025          4                27
6      8.6        0              1.60025          4                27
  Ref_naturnah Ref_Wege Ref_Einkommen Ref_Alter Ref_HHGroesse Ref_Hitzetage
1           18       41           1.8        45             2            11
2           18       41           1.8        45             2            11
3           18       41           1.8        45             2            11
4           18       41           1.8        45             2            11
5           18       41           1.8        45             2            11
6           18       41           1.8        45             2            11
  Ref_Einwohner Ref_Einwohnerdichte Befragungsjahr Rekrutierung Einwohner_Zelle
1        750000                  23              0            0               0
2        750000                  23              0            0               0
3        750000                  23              0            0               0
4        750000                  23              0            0               0
5        750000                  23              0            0               0
6        750000                  23              0            0               0
  GF_Anteil Uni_degree
1      16.7          1
2      16.7          1
3      16.7          1
4      16.7          1
5      16.7          1
6      16.7          1
all_database %>% mutate_to_int(starts_with("BAEUME")) %>% head()
     id choice DESIGN_ROW BAEUME.1 BAEUME.2 BAEUME.3 GRUENFLAECHEN.1
1 15169      1          6        9        7        5              34
2 15169      1          7        5        7        5              34
3 15169      1         10        5        9        5              44
4 15169      1         16        7        5        5              39
5 15169      1         25        7        9        5              44
6 15169      1         26        5        7        5              39
  GRUENFLAECHEN.2 GRUENFLAECHEN.3 NATURNAH.1 NATURNAH.2 NATURNAH.3 WEGE.1
1              39              34         35         45         25     40
2              39              34         35         45         25     40
3              34              34         25         45         25     40
4              44              34         35         45         25     30
5              34              34         45         25         25     40
6              34              34         25         35         25     30
  WEGE.2 WEGE.3 BEITRAG.1 BEITRAG.2 BEITRAG.3 i_NUMBER i_TIME Gender Education
1     30     30         6        60         0    15169    589      1         7
2     50     30        36       120         0    15169    589      1         7
3     30     30       120       240         0    15169    589      1         7
4     50     30        12        36         0    15169    589      1         7
5     50     30         6        12         0    15169    589      1         7
6     50     30       120       240         0    15169    589      1         7
  Income Stadt   PLZ estimated_Trees estimated_Greenspace estimated_Natural
1      1    23 86153              15                   30                30
2      1    23 86153              15                   30                30
3      1    23 86153              15                   30                30
4      1    23 86153              15                   30                30
5      1    23 86153              15                   30                30
6      1    23 86153              15                   30                30
  Block Balcony Garden Flat_size HH_size Num_Kids Age     City wohn_lon
1     3       0      1        62       1        0  59 Augsburg 10.91613
2     3       0      1        62       1        0  59 Augsburg 10.91613
3     3       0      1        62       1        0  59 Augsburg 10.91613
4     3       0      1        62       1        0  59 Augsburg 10.91613
5     3       0      1        62       1        0  59 Augsburg 10.91613
6     3       0      1        62       1        0  59 Augsburg 10.91613
  wohn_lat park_lon park_lat parkort wohnort SQ_Gruenflaechen SQ_Baeume
1 48.36318 10.93323 48.35956       1       1               34  5.032278
2 48.36318 10.93323 48.35956       1       1               34  5.032278
3 48.36318 10.93323 48.35956       1       1               34  5.032278
4 48.36318 10.93323 48.35956       1       1               34  5.032278
5 48.36318 10.93323 48.35956       1       1               34  5.032278
6 48.36318 10.93323 48.35956       1       1               34  5.032278
  SQ_naturnah SQ_Wege Population Pop_density mean_HHsize mean_Age mean_Income
1          25      30     299021    18.40764        2.01    44.28     1600.25
2          25      30     299021    18.40764        2.01    44.28     1600.25
3          25      30     299021    18.40764        2.01    44.28     1600.25
4          25      30     299021    18.40764        2.01    44.28     1600.25
5          25      30     299021    18.40764        2.01    44.28     1600.25
6          25      30     299021    18.40764        2.01    44.28     1600.25
  Hot_days Ost_West mean_Einkommen_level Ref_Baeume Ref_Gruenflaechen
1      8.6        0              1.60025          4                27
2      8.6        0              1.60025          4                27
3      8.6        0              1.60025          4                27
4      8.6        0              1.60025          4                27
5      8.6        0              1.60025          4                27
6      8.6        0              1.60025          4                27
  Ref_naturnah Ref_Wege Ref_Einkommen Ref_Alter Ref_HHGroesse Ref_Hitzetage
1           18       41           1.8        45             2            11
2           18       41           1.8        45             2            11
3           18       41           1.8        45             2            11
4           18       41           1.8        45             2            11
5           18       41           1.8        45             2            11
6           18       41           1.8        45             2            11
  Ref_Einwohner Ref_Einwohnerdichte Befragungsjahr Rekrutierung Einwohner_Zelle
1        750000                  23              0            0               0
2        750000                  23              0            0               0
3        750000                  23              0            0               0
4        750000                  23              0            0               0
5        750000                  23              0            0               0
6        750000                  23              0            0               0
  GF_Anteil Uni_degree
1      16.7          1
2      16.7          1
3      16.7          1
4      16.7          1
5      16.7          1
6      16.7          1
all_database %>% mutate_to_int(everything()) %>% head()
     id choice DESIGN_ROW BAEUME.1 BAEUME.2 BAEUME.3 GRUENFLAECHEN.1
1 15169      1          6        9        7        5              34
2 15169      1          7        5        7        5              34
3 15169      1         10        5        9        5              44
4 15169      1         16        7        5        5              39
5 15169      1         25        7        9        5              44
6 15169      1         26        5        7        5              39
  GRUENFLAECHEN.2 GRUENFLAECHEN.3 NATURNAH.1 NATURNAH.2 NATURNAH.3 WEGE.1
1              39              34         35         45         25     40
2              39              34         35         45         25     40
3              34              34         25         45         25     40
4              44              34         35         45         25     30
5              34              34         45         25         25     40
6              34              34         25         35         25     30
  WEGE.2 WEGE.3 BEITRAG.1 BEITRAG.2 BEITRAG.3 i_NUMBER i_TIME Gender Education
1     30     30         6        60         0    15169    589      1         7
2     50     30        36       120         0    15169    589      1         7
3     30     30       120       240         0    15169    589      1         7
4     50     30        12        36         0    15169    589      1         7
5     50     30         6        12         0    15169    589      1         7
6     50     30       120       240         0    15169    589      1         7
  Income Stadt   PLZ estimated_Trees estimated_Greenspace estimated_Natural
1      1    23 86153              15                   30                30
2      1    23 86153              15                   30                30
3      1    23 86153              15                   30                30
4      1    23 86153              15                   30                30
5      1    23 86153              15                   30                30
6      1    23 86153              15                   30                30
  Block Balcony Garden Flat_size HH_size Num_Kids Age City wohn_lon wohn_lat
1     3       0      1        62       1        0  59   NA       10       48
2     3       0      1        62       1        0  59   NA       10       48
3     3       0      1        62       1        0  59   NA       10       48
4     3       0      1        62       1        0  59   NA       10       48
5     3       0      1        62       1        0  59   NA       10       48
6     3       0      1        62       1        0  59   NA       10       48
  park_lon park_lat parkort wohnort SQ_Gruenflaechen SQ_Baeume SQ_naturnah
1       10       48       1       1               34         5          25
2       10       48       1       1               34         5          25
3       10       48       1       1               34         5          25
4       10       48       1       1               34         5          25
5       10       48       1       1               34         5          25
6       10       48       1       1               34         5          25
  SQ_Wege Population Pop_density mean_HHsize mean_Age mean_Income Hot_days
1      30     299021          18           2       44        1600        8
2      30     299021          18           2       44        1600        8
3      30     299021          18           2       44        1600        8
4      30     299021          18           2       44        1600        8
5      30     299021          18           2       44        1600        8
6      30     299021          18           2       44        1600        8
  Ost_West mean_Einkommen_level Ref_Baeume Ref_Gruenflaechen Ref_naturnah
1        0                    1          4                27           18
2        0                    1          4                27           18
3        0                    1          4                27           18
4        0                    1          4                27           18
5        0                    1          4                27           18
6        0                    1          4                27           18
  Ref_Wege Ref_Einkommen Ref_Alter Ref_HHGroesse Ref_Hitzetage Ref_Einwohner
1       41             1        45             2            11        750000
2       41             1        45             2            11        750000
3       41             1        45             2            11        750000
4       41             1        45             2            11        750000
5       41             1        45             2            11        750000
6       41             1        45             2            11        750000
  Ref_Einwohnerdichte Befragungsjahr Rekrutierung Einwohner_Zelle GF_Anteil
1                  23              0            0               0        16
2                  23              0            0               0        16
3                  23              0            0               0        16
4                  23              0            0               0        16
5                  23              0            0               0        16
6                  23              0            0               0        16
  Uni_degree
1          1
2          1
3          1
4          1
5          1
6          1

Correctly mutate in a function cont.

  • This works fine.

  • The trick is to use curly brackets.

  • This is still not a good example

Making it more complex

  • We now want to create new variables which have a speficic ending
mutate_to_int2 <- function(.data, variables, new_name){
  .data %>% 
    mutate(across({{ variables }}, ~ as.integer(.), .names = paste0(new_name, "_{.col}")))
}


all_database %>% mutate_to_int2(contains("BAEUME"), "int") %>%
  select(matches("BAEUME")) %>%  head()
  BAEUME.1 BAEUME.2 BAEUME.3 SQ_Baeume Ref_Baeume int_BAEUME.1 int_BAEUME.2
1 9.032278 7.032278 5.032278  5.032278          4            9            7
2 5.032278 7.032278 5.032278  5.032278          4            5            7
3 5.032278 9.032278 5.032278  5.032278          4            5            9
4 7.032278 5.032278 5.032278  5.032278          4            7            5
5 7.032278 9.032278 5.032278  5.032278          4            7            9
6 5.032278 7.032278 5.032278  5.032278          4            5            7
  int_BAEUME.3 int_SQ_Baeume int_Ref_Baeume
1            5             5              4
2            5             5              4
3            5             5              4
4            5             5              4
5            5             5              4
6            5             5              4

Use case

  • Lets assume we have several datasets, with the same variables which we want to rename.

  • There are many ways to do this, but lets go for this

  • First, we use map to read in all the datasets

  • We use to identify the datasets and then read them in one after the other

Example

filenames <- list.files("data" , full.names = T)
names <- tools::file_path_sans_ext(basename(filenames))


all_sets <- map(filenames,readRDS) %>% 
setNames(names)



#head(all_sets)

Example cont.

  • We now want to apply our function to all datasets at once
new_sets <- map(all_sets,  mutate_to_int2 , starts_with("BAEUME"), "int")
  • We now have only one line to make the desired transformation to all datasets.

A very different example

  • Say you have a function that has some hardcoded mutations to the data

  • You want the user to provide additional mutations which depend on the dataset.

mymutate <- function(data, userexpressions){
  
  data %>% mutate(
    userexpressions,
    U=0.1*BAEUME.1 + 0.2*BAEUME.2
      )
  
  }

cont

  • The user should do whatever they want to do
userexpressions = list(BAEUME.1=     expr(BAEUME.1*10), 
                       BAEUME.2=     expr(BAEUME.2*10)
                       )


mynewdata <- function(data, userexpressions){
  
  data %>% mutate(
    !!! userexpressions,
    U=0.1*BAEUME.1 + 0.2*BAEUME.2
      )
  
  }

mynewdata(all_database, userexpressions) %>% select(starts_with("BAEUME"), U) %>% head()
  BAEUME.1 BAEUME.2 BAEUME.3        U
1 90.32278 70.32278 5.032278 23.09684
2 50.32278 70.32278 5.032278 19.09684
3 50.32278 90.32278 5.032278 23.09684
4 70.32278 50.32278 5.032278 17.09684
5 70.32278 90.32278 5.032278 25.09684
6 50.32278 70.32278 5.032278 19.09684
  • Instead of using !!, we have to use !!! because we work with a list

  • Also note that we wrote expr when defining the manipulations.

More advanced examples

  • You can also work with formulas

  • The user can write formulas as inputs

Integrate into function

  • Function to read formulas

  • Requires package formula.tools

library(formula.tools) 

 by_formula <- function(equation){ 
    cur_data_all() %>%
      transmute(!!lhs(equation) := !!rhs(equation) )
  } 
  • And then put it into mutate using map_dfc
all_database <-  all_database %>% 
      mutate(!!! userexpressions ,
             map_dfc(utils,by_formula))   #For each formula, we run the function

Further reading & cheat-sheets