Chapter 5 Tidy your data

5.1 what is tidy data?

Three rules:

  • Each variable must have its own column.

  • Each observation must have its own row.

  • Each value must have its own cell.

library(tidyverse)
library(knitr)
# tidy data
kable(table1, caption = "a This is a tidy dataset!")
Table 5.1: a This is a tidy dataset!
country year cases population
Afghanistan 1999 745 19987071
Afghanistan 2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 1999 212258 1272915272
China 2000 213766 1280428583
kable(table2, caption = "b case and population missing")
Table 5.1: b case and population missing
country year type count
Afghanistan 1999 cases 745
Afghanistan 1999 population 19987071
Afghanistan 2000 cases 2666
Afghanistan 2000 population 20595360
Brazil 1999 cases 37737
Brazil 1999 population 172006362
Brazil 2000 cases 80488
Brazil 2000 population 174504898
China 1999 cases 212258
China 1999 population 1272915272
China 2000 cases 213766
China 2000 population 1280428583
kable(table3, caption = "c case and population merged")
Table 5.1: c case and population merged
country year rate
Afghanistan 1999 745/19987071
Afghanistan 2000 2666/20595360
Brazil 1999 37737/172006362
Brazil 2000 80488/174504898
China 1999 212258/1272915272
China 2000 213766/1280428583
kable(table4a, caption = "d year missing, cases")
Table 5.1: d year missing, cases
country 1999 2000
Afghanistan 745 2666
Brazil 37737 80488
China 212258 213766
kable(table4b, caption = "e year missing, population")
Table 5.1: e year missing, population
country 1999 2000
Afghanistan 19987071 20595360
Brazil 172006362 174504898
China 1272915272 1280428583

5.2 Tools in tidyr

5.2.1 gather ( )

For some datasets, some of the column names are not names of variables but values of a variable.

kable(table4a, caption= "year missing, cases")
Table 5.2: year missing, cases
country 1999 2000
Afghanistan 745 2666
Brazil 37737 80488
China 212258 213766

We need to gather years like 1999 and 2000 into one variable year.

gather(table4a, '1999','2000',key = "year", value = "cases")
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766
gather(table4b, '1999','2000',key = "year", value = "population")
## # A tibble: 6 x 3
##   country     year  population
##   <chr>       <chr>      <int>
## 1 Afghanistan 1999    19987071
## 2 Brazil      1999   172006362
## 3 China       1999  1272915272
## 4 Afghanistan 2000    20595360
## 5 Brazil      2000   174504898
## 6 China       2000  1280428583

5.2.2 spread ( )

Sometime the observations are scattered across muiltiple rows.

kable(table2, caption = "b case and population missing")
Table 5.3: b case and population missing
country year type count
Afghanistan 1999 cases 745
Afghanistan 1999 population 19987071
Afghanistan 2000 cases 2666
Afghanistan 2000 population 20595360
Brazil 1999 cases 37737
Brazil 1999 population 172006362
Brazil 2000 cases 80488
Brazil 2000 population 174504898
China 1999 cases 212258
China 1999 population 1272915272
China 2000 cases 213766
China 2000 population 1280428583
spread(table2, key = type, value = count)
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

To sum up, gather() makes wide tables long while spread() makes long tables wide. The key idea here is to think about the variables you want to explore and make sure that they are placed in different columns. This is very important for data visualization and stats modeling in R.


5.2.3 separate()

In some cases, one column may contain a combination of two variables. We need to separate these two variables.

kable(table3, caption = "c case and population merged")
Table 5.4: c case and population merged
country year rate
Afghanistan 1999 745/19987071
Afghanistan 2000 2666/20595360
Brazil 1999 37737/172006362
Brazil 2000 80488/174504898
China 1999 212258/1272915272
China 2000 213766/1280428583
#separate by a delimiter
separate(table3, rate, into = c("cases", "population"), sep = "/")
## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
#separate by the first two digits

separate(table3, rate, into = c("cases", "population"), sep = 2)
## # A tibble: 6 x 4
##   country      year cases population     
##   <chr>       <int> <chr> <chr>          
## 1 Afghanistan  1999 74    5/19987071     
## 2 Afghanistan  2000 26    66/20595360    
## 3 Brazil       1999 37    737/172006362  
## 4 Brazil       2000 80    488/174504898  
## 5 China        1999 21    2258/1272915272
## 6 China        2000 21    3766/1280428583
#separate by the last two digits

separate(table3, rate, into = c("cases", "population"), sep = -2)
## # A tibble: 6 x 4
##   country      year cases           population
##   <chr>       <int> <chr>           <chr>     
## 1 Afghanistan  1999 745/199870      71        
## 2 Afghanistan  2000 2666/205953     60        
## 3 Brazil       1999 37737/1720063   62        
## 4 Brazil       2000 80488/1745048   98        
## 5 China        1999 212258/12729152 72        
## 6 China        2000 213766/12804285 83

5.2.4 unite()

table_new = separate(table3, year, into = c("year1", "year2"), sep = 2)

table_new
## # A tibble: 6 x 4
##   country     year1 year2 rate             
##   <chr>       <chr> <chr> <chr>            
## 1 Afghanistan 19    99    745/19987071     
## 2 Afghanistan 20    00    2666/20595360    
## 3 Brazil      19    99    37737/172006362  
## 4 Brazil      20    00    80488/174504898  
## 5 China       19    99    212258/1272915272
## 6 China       20    00    213766/1280428583
unite(table_new, new, year1, year2)# by default there is an underscore between two parts.
## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 19_99 745/19987071     
## 2 Afghanistan 20_00 2666/20595360    
## 3 Brazil      19_99 37737/172006362  
## 4 Brazil      20_00 80488/174504898  
## 5 China       19_99 212258/1272915272
## 6 China       20_00 213766/1280428583
unite(table_new, new, year1, year2, sep = "")
## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583