

How to reshape your data in order to make the analysis easier.

Each variable is saved in its column.
Each observation is saved in its own row.
Image Credit: Hadley Wickham and Garrett Grolemund
library(tidyverse) #or library(tidyr)library(magrittr)

tidyr packageHadley Wickham, Chief Scientist at RStudio explaining tidyr at WOMBAT organized by Monash University, Australia.
Image taken by Thiyanga S Talagala at WOMBAT Melbourne, Australia, December-2019





pivot_longer
In tidyr (2014) gather
pivot_wider
In tidyr (2014) spread
separate
unite
Main input: data frame or tibble.
Output: tibble
pivot_longerpivot_longer()Turns columns into rows.
From wide format to long format.

pivot_longer()dengue <- tibble( dist = c("Colombo", "Gampaha", "Kalutara"), '2017' = c(20718, 10258, 34274), '2018' = c(16573, 5857, 31647), '2019' = c(8395, 3155, 10961)); dengue
# A tibble: 3 x 4 dist `2017` `2018` `2019` <chr> <dbl> <dbl> <dbl>1 Colombo 20718 16573 83952 Gampaha 10258 5857 31553 Kalutara 34274 31647 10961dengue %>% pivot_longer(2:4, names_to="Year", values_to = "Dengue counts")
# A tibble: 9 x 3 dist Year `Dengue counts` <chr> <chr> <dbl>1 Colombo 2017 207182 Colombo 2018 165733 Colombo 2019 83954 Gampaha 2017 102585 Gampaha 2018 58576 Gampaha 2019 31557 Kalutara 2017 342748 Kalutara 2018 316479 Kalutara 2019 10961pivot_widerpivot_wider()
pivot_wider()Corona <- tibble(country = rep(c("USA", "Brazil", "Russia"), each=2),status = rep(c("Death", "Recovered"), 3),count = c(99381, 451745, 22746, 149911, 3633, 118798))
Corona
# A tibble: 6 x 3 country status count <chr> <chr> <dbl>1 USA Death 993812 USA Recovered 4517453 Brazil Death 227464 Brazil Recovered 1499115 Russia Death 36336 Russia Recovered 118798pivot_wider()Corona
# A tibble: 6 x 3 country status count <chr> <chr> <dbl>1 USA Death 993812 USA Recovered 4517453 Brazil Death 227464 Brazil Recovered 1499115 Russia Death 36336 Russia Recovered 118798Corona %>% pivot_wider(names_from=status, values_from=count)
# A tibble: 3 x 3 country Death Recovered <chr> <dbl> <dbl>1 USA 99381 4517452 Brazil 22746 1499113 Russia 3633 118798corona_wide_format <- Corona %>% pivot_wider(names_from=status, values_from=count)corona_wide_format
# A tibble: 3 x 3 country Death Recovered <chr> <dbl> <dbl>1 USA 99381 4517452 Brazil 22746 1499113 Russia 3633 118798pivot_longer vs pivot_wider
pivot_longer and pivot_widerprofit <- tibble(year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),quarter = c( 1, 2, 3, 4, 2, 3, 4),income = c(2, NA, 3, NA, 4, 5, 6))profit
# A tibble: 7 x 3 year quarter income <dbl> <dbl> <dbl>1 2015 1 22 2015 2 NA3 2015 3 34 2015 4 NA5 2016 2 46 2016 3 57 2016 4 6pivot_longer and pivot_wider# A tibble: 7 x 3 year quarter income <dbl> <dbl> <dbl>1 2015 1 22 2015 2 NA3 2015 3 34 2015 4 NA5 2016 2 46 2016 3 57 2016 4 6profit %>%pivot_wider(names_from = year, values_from = income)
# A tibble: 4 x 3 quarter `2015` `2016` <dbl> <dbl> <dbl>1 1 2 NA2 2 NA 43 3 3 54 4 NA 6# A tibble: 4 x 3 quarter `2015` `2016` <dbl> <dbl> <dbl>1 1 2 NA2 2 NA 43 3 3 54 4 NA 6profit %>%pivot_wider(names_from = year, values_from = income) %>%pivot_longer(cols = c(`2015`, `2016`),names_to = "year",values_to = "income",values_drop_na = TRUE)
# A tibble: 5 x 3 quarter year income <dbl> <chr> <dbl>1 1 2015 22 2 2016 43 3 2015 34 3 2016 55 4 2016 6separateseparate()Melbourne <- tibble(Date = c("10-5-2020", "11-5-2020", "12-5-2020","13-5-2020"), Tmin = c(5, 9, 9, 7), Tmax = c(18, 16, 16, 17), Rainfall= c(30, 40, 10, 5)); Melbourne
# A tibble: 4 x 4 Date Tmin Tmax Rainfall <chr> <dbl> <dbl> <dbl>1 10-5-2020 5 18 302 11-5-2020 9 16 403 12-5-2020 9 16 104 13-5-2020 7 17 5Melbourne %>% separate(Date, into=c("day", "month", "year"), sep="-")
# A tibble: 4 x 6 day month year Tmin Tmax Rainfall <chr> <chr> <chr> <dbl> <dbl> <dbl>1 10 5 2020 5 18 302 11 5 2020 9 16 403 12 5 2020 9 16 104 13 5 2020 7 17 5separate()df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))df
x1 <NA>2 a.b3 a.d4 b.cdf %>% separate(x, c("Text1", "Text2"))
Text1 Text21 <NA> <NA>2 a b3 a d4 b cseparate()tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 x 1 input <chr>1 a 2 a b 3 a-b c4 <NA>separate()tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 x 1 input <chr>1 a 2 a b 3 a-b c4 <NA>tbl %>% separate(input, c("Input1", "Input2"))
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [3].## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [1].## # A tibble: 4 x 2## Input1 Input2## <chr> <chr> ## 1 a <NA> ## 2 a b ## 3 a b ## 4 <NA> <NA>separate()tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 x 1 input <chr>1 a 2 a b 3 a-b c4 <NA>separate()tbl <- tibble(input = c("a", "a b", "a-b c", NA)); tbl
# A tibble: 4 x 1 input <chr>1 a 2 a b 3 a-b c4 <NA>tbl %>% separate(input, c("Input1", "Input2", "Input3"))
tbl %>% separate(input, c("Input1", "Input2", "Input3"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [1, 2].## # A tibble: 4 x 3## Input1 Input2 Input3## <chr> <chr> <chr> ## 1 a <NA> <NA> ## 2 a b <NA> ## 3 a b c ## 4 <NA> <NA> <NA>uniteunite()projects <- tibble( Country = c("USA", "USA", "AUS", "AUS"), State = c("LA", "CO", "VIC", "NSW"), Cost = c(1000, 11000, 20000,30000))projects
# A tibble: 4 x 3 Country State Cost <chr> <chr> <dbl>1 USA LA 10002 USA CO 110003 AUS VIC 200004 AUS NSW 30000projects %>% unite("Location", c("State", "Country"))
# A tibble: 4 x 2 Location Cost <chr> <dbl>1 LA_USA 10002 CO_USA 110003 VIC_AUS 200004 NSW_AUS 30000unite()projects %>% unite("Location", c("State", "Country"))
# A tibble: 4 x 2 Location Cost <chr> <dbl>1 LA_USA 10002 CO_USA 110003 VIC_AUS 200004 NSW_AUS 30000projects %>% unite("Location", c("State", "Country"), sep="-")
# A tibble: 4 x 2 Location Cost <chr> <dbl>1 LA-USA 10002 CO-USA 110003 VIC-AUS 200004 NSW-AUS 30000separate vs unite

Keyboard shortcuts
| ↑, ←, Pg Up, k | Go to previous slide |
| ↓, →, Pg Dn, Space, j | Go to next slide |
| Home | Go to first slide |
| End | Go to last slide |
| Number + Return | Go to specific slide |
| b / m / f | Toggle blackout / mirrored / fullscreen mode |
| c | Clone slideshow |
| p | Toggle presenter mode |
| s | Start & Stop the presentation timer |
| t | Reset the presentation timer |
| ?, h | Toggle this help |
| Esc | Back to slideshow |