class: center, middle, title-slide

# Data manipulation - solutions
### Olivier Gimenez
### last updated: 2022-05-16

---

## Question 1a

.tiny-font[

```r
penguins # display data
```

```
## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # … with 334 more rows, and 2 more variables: sex <fct>, year <int>
```

]

---

## Question 1a

.tiny-font[

```r
penguins %>%
  glimpse() # display data, one line per column
```

```
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
```

]

---

## Question 1b

```r
dat <- penguins %>%
  # keep only the rows where sex is known
  filter(!is.na(sex)) %>%
  # keep only the variables used in the next questions
  select(species, island, bill_length_mm, body_mass_g)
```

---

## Question 2a

.tiny-font[

```r
dat # number of penguins = number of rows
```

```
## # A tibble: 333 × 4
##    species island    bill_length_mm body_mass_g
##    <fct>   <fct>              <dbl>       <int>
##  1 Adelie  Torgersen           39.1        3750
##  2 Adelie  Torgersen           39.5        3800
##  3 Adelie  Torgersen           40.3        3250
##  4 Adelie  Torgersen           36.7        3450
##  5 Adelie  Torgersen           39.3        3650
##  6 Adelie  Torgersen           38.9        3625
##  7 Adelie  Torgersen           39.2        4675
##  8 Adelie  Torgersen           41.1        3200
##  9 Adelie  Torgersen           38.6        3800
## 10 Adelie  Torgersen           34.6        4400
## # … with 323 more rows
```

]

---

## Questions 2b and 2d

```r
# number of species, and number of penguins per species
dat %>%
  count(species)
```

```
## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie      146
## 2 Chinstrap    68
## 3 Gentoo      119
```

---

## Questions 2b and 2d

```r
# same counts, sorted by decreasing n
dat %>%
  count(species, sort = TRUE)
```

```
## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie      146
## 2 Gentoo      119
## 3 Chinstrap    68
```

---

## Question 2c

```r
# number of islands, and number of penguins per island
dat %>%
  count(island)
```

```
## # A tibble: 3 × 2
##   island        n
##   <fct>     <int>
## 1 Biscoe      163
## 2 Dream       123
## 3 Torgersen    47
```

---

## Question 2e

```r
# number of penguins per species and island
dat %>%
  count(species, island)
```

```
## # A tibble: 5 × 3
##   species   island        n
##   <fct>     <fct>     <int>
## 1 Adelie    Biscoe       44
## 2 Adelie    Dream        55
## 3 Adelie    Torgersen    47
## 4 Chinstrap Dream        68
## 5 Gentoo    Biscoe      119
```

---

## Question 3a: mean body mass

.tiny-font[

```r
# option 1: mutate() keeps every row and repeats the overall mean in a new column
dat %>%
  mutate(mean_bm = mean(body_mass_g))
```

```
## # A tibble: 333 × 5
##    species island    bill_length_mm body_mass_g mean_bm
##    <fct>   <fct>              <dbl>       <int>   <dbl>
##  1 Adelie  Torgersen           39.1        3750   4207.
##  2 Adelie  Torgersen           39.5        3800   4207.
##  3 Adelie  Torgersen           40.3        3250   4207.
##  4 Adelie  Torgersen           36.7        3450   4207.
##  5 Adelie  Torgersen           39.3        3650   4207.
##  6 Adelie  Torgersen           38.9        3625   4207.
##  7 Adelie  Torgersen           39.2        4675   4207.
##  8 Adelie  Torgersen           41.1        3200   4207.
##  9 Adelie  Torgersen           38.6        3800   4207.
## 10 Adelie  Torgersen           34.6        4400   4207.
## # … with 323 more rows
```

]

---

## Question 3a: mean body mass

```r
# option 2: summarize() collapses the data to a single row
dat %>%
  summarize(mean_bm = mean(body_mass_g))
```

```
## # A tibble: 1 × 1
##   mean_bm
##     <dbl>
## 1   4207.
```

---

## Question 3b: mean body mass per species

.tiny-font[

```r
# option 1: mutate() keeps every row and repeats the species mean in a new column
dat %>%
  group_by(species) %>%
  mutate(mean_bm = mean(body_mass_g))
```

```
## # A tibble: 333 × 5
## # Groups:   species [3]
##    species island    bill_length_mm body_mass_g mean_bm
##    <fct>   <fct>              <dbl>       <int>   <dbl>
##  1 Adelie  Torgersen           39.1        3750   3706.
##  2 Adelie  Torgersen           39.5        3800   3706.
##  3 Adelie  Torgersen           40.3        3250   3706.
##  4 Adelie  Torgersen           36.7        3450   3706.
##  5 Adelie  Torgersen           39.3        3650   3706.
##  6 Adelie  Torgersen           38.9        3625   3706.
##  7 Adelie  Torgersen           39.2        4675   3706.
##  8 Adelie  Torgersen           41.1        3200   3706.
##  9 Adelie  Torgersen           38.6        3800   3706.
## 10 Adelie  Torgersen           34.6        4400   3706.
## # … with 323 more rows
```

]

---

## Question 3b: mean body mass per species

```r
# option 2: summarize() returns one row per species
dat %>%
  group_by(species) %>%
  summarize(mean_bm = mean(body_mass_g))
```

```
## # A tibble: 3 × 2
##   species   mean_bm
##   <fct>       <dbl>
## 1 Adelie      3706.
## 2 Chinstrap   3733.
## 3 Gentoo      5092.
```

---

## Question 3c: mean traits

```r
# all traits at once, selecting the columns by position (a range of names)
dat %>%
  group_by(species) %>%
  summarize(across(bill_length_mm:body_mass_g, mean))
```

```
## # A tibble: 3 × 3
##   species   bill_length_mm body_mass_g
##   <fct>              <dbl>       <dbl>
## 1 Adelie              38.8       3706.
## 2 Chinstrap           48.8       3733.
## 3 Gentoo              47.6       5092.
```

---

## Question 3c: mean traits

```r
# all traits at once, selecting the columns by type (every numeric column)
dat %>%
  group_by(species) %>%
  summarize(across(where(is.numeric), mean))
```

```
## # A tibble: 3 × 3
##   species   bill_length_mm body_mass_g
##   <fct>              <dbl>       <dbl>
## 1 Adelie              38.8       3706.
## 2 Chinstrap           48.8       3733.
## 3 Gentoo              47.6       5092.
```