Chapter 5 Tidy your data

5.1 what is tidy data?

Three rules:

Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.

library(tidyverse)
library(knitr)
# tidy data
kable(table1, caption = "a This is a tidy dataset!")

Table 5.1: a This is a tidy dataset!
country	year	cases	population
Afghanistan	1999	745	19987071
Afghanistan	2000	2666	20595360
Brazil	1999	37737	172006362
Brazil	2000	80488	174504898
China	1999	212258	1272915272
China	2000	213766	1280428583

kable(table2, caption = "b case and population missing")

Table 5.1: b case and population missing
country	year	type	count
Afghanistan	1999	cases	745
Afghanistan	1999	population	19987071
Afghanistan	2000	cases	2666
Afghanistan	2000	population	20595360
Brazil	1999	cases	37737
Brazil	1999	population	172006362
Brazil	2000	cases	80488
Brazil	2000	population	174504898
China	1999	cases	212258
China	1999	population	1272915272
China	2000	cases	213766
China	2000	population	1280428583

kable(table3, caption = "c case and population merged")

Table 5.1: c case and population merged
country	year	rate
Afghanistan	1999	745/19987071
Afghanistan	2000	2666/20595360
Brazil	1999	37737/172006362
Brazil	2000	80488/174504898
China	1999	212258/1272915272
China	2000	213766/1280428583

kable(table4a, caption = "d year missing, cases")

Table 5.1: d year missing, cases
country	1999	2000
Afghanistan	745	2666
Brazil	37737	80488
China	212258	213766

kable(table4b, caption = "e year missing, population")

Table 5.1: e year missing, population
country	1999	2000
Afghanistan	19987071	20595360
Brazil	172006362	174504898
China	1272915272	1280428583

5.2 Tools in tidyr

5.2.1 gather ( )

For some datasets, some of the column names are not names of variables but values of a variable.

kable(table4a, caption= "year missing, cases")

Table 5.2: year missing, cases
country	1999	2000
Afghanistan	745	2666
Brazil	37737	80488
China	212258	213766

We need to gather years like 1999 and 2000 into one variable year.

gather(table4a, '1999','2000',key = "year", value = "cases")

## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766

gather(table4b, '1999','2000',key = "year", value = "population")

## # A tibble: 6 x 3
##   country     year  population
##   <chr>       <chr>      <int>
## 1 Afghanistan 1999    19987071
## 2 Brazil      1999   172006362
## 3 China       1999  1272915272
## 4 Afghanistan 2000    20595360
## 5 Brazil      2000   174504898
## 6 China       2000  1280428583

5.2.2 spread ( )

Sometime the observations are scattered across muiltiple rows.

kable(table2, caption = "b case and population missing")

Table 5.3: b case and population missing
country	year	type	count
Afghanistan	1999	cases	745
Afghanistan	1999	population	19987071
Afghanistan	2000	cases	2666
Afghanistan	2000	population	20595360
Brazil	1999	cases	37737
Brazil	1999	population	172006362
Brazil	2000	cases	80488
Brazil	2000	population	174504898
China	1999	cases	212258
China	1999	population	1272915272
China	2000	cases	213766
China	2000	population	1280428583

spread(table2, key = type, value = count)

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

To sum up, gather() makes wide tables long while spread() makes long tables wide. The key idea here is to think about the variables you want to explore and make sure that they are placed in different columns. This is very important for data visualization and stats modeling in R.

5.2.3 separate()

In some cases, one column may contain a combination of two variables. We need to separate these two variables.

kable(table3, caption = "c case and population merged")

Table 5.4: c case and population merged
country	year	rate
Afghanistan	1999	745/19987071
Afghanistan	2000	2666/20595360
Brazil	1999	37737/172006362
Brazil	2000	80488/174504898
China	1999	212258/1272915272
China	2000	213766/1280428583

#separate by a delimiter
separate(table3, rate, into = c("cases", "population"), sep = "/")

## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

#separate by the first two digits

separate(table3, rate, into = c("cases", "population"), sep = 2)

## # A tibble: 6 x 4
##   country      year cases population     
##   <chr>       <int> <chr> <chr>          
## 1 Afghanistan  1999 74    5/19987071     
## 2 Afghanistan  2000 26    66/20595360    
## 3 Brazil       1999 37    737/172006362  
## 4 Brazil       2000 80    488/174504898  
## 5 China        1999 21    2258/1272915272
## 6 China        2000 21    3766/1280428583

#separate by the last two digits

separate(table3, rate, into = c("cases", "population"), sep = -2)

## # A tibble: 6 x 4
##   country      year cases           population
##   <chr>       <int> <chr>           <chr>     
## 1 Afghanistan  1999 745/199870      71        
## 2 Afghanistan  2000 2666/205953     60        
## 3 Brazil       1999 37737/1720063   62        
## 4 Brazil       2000 80488/1745048   98        
## 5 China        1999 212258/12729152 72        
## 6 China        2000 213766/12804285 83

5.2.4 unite()

table_new = separate(table3, year, into = c("year1", "year2"), sep = 2)

table_new

## # A tibble: 6 x 4
##   country     year1 year2 rate             
##   <chr>       <chr> <chr> <chr>            
## 1 Afghanistan 19    99    745/19987071     
## 2 Afghanistan 20    00    2666/20595360    
## 3 Brazil      19    99    37737/172006362  
## 4 Brazil      20    00    80488/174504898  
## 5 China       19    99    212258/1272915272
## 6 China       20    00    213766/1280428583

unite(table_new, new, year1, year2)# by default there is an underscore between two parts.

## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 19_99 745/19987071     
## 2 Afghanistan 20_00 2666/20595360    
## 3 Brazil      19_99 37737/172006362  
## 4 Brazil      20_00 80488/174504898  
## 5 China       19_99 212258/1272915272
## 6 China       20_00 213766/1280428583

unite(table_new, new, year1, year2, sep = "")

## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583