Chapter 6 Tranform your data

6.1 Selecting variables of interest

When we get a dataset, one of the most important things to do is to pick the variables we need, especially when the raw dataset has too many columns.

library(tidyverse)

# getting the data
filename = list.files(paste0(getwd(), "/data"),full.names=T, pattern = ".csv") # get only csv files

df = data.frame()
bin = data.frame()
for (i in 1:length(filename)){
  # length(filename) = how many files
  bin = read.csv(filename[i])
  df = rbind(df, bin)
}

#head(df)

#colnames(df)

In this dataset df, we need to get the ID of participants as stored in Subject, the stimuli they heard in “tone.Trial”, their response in “insex1.RESP”, their response time in “insex1.RT”, and the experiment conditions in “Procedure.Block.”.

df_new = select(df, subject = "Subject",
                stimuli = "tone.Trial.",
                response = "insex1.RESP",
                response_rt = "insex1.RT",
                block = "Procedure.Block.",
                ExperimentName)
head(df_new)
##   subject stimuli response response_rt    block           ExperimentName
## 1     402      NA                   NA pracproc Assim_main_chinese_500ms
## 2     402      NA                   NA pracproc Assim_main_chinese_500ms
## 3     402      NA                   NA pracproc Assim_main_chinese_500ms
## 4     402      NA                   NA pracproc Assim_main_chinese_500ms
## 5     402      NA                   NA pracproc Assim_main_chinese_500ms
## 6     402      NA                   NA pracproc Assim_main_chinese_500ms
# select by names
head(select(df_new, subject: response))
##   subject stimuli response
## 1     402      NA         
## 2     402      NA         
## 3     402      NA         
## 4     402      NA         
## 5     402      NA         
## 6     402      NA
head(select(df_new, subject: response))
##   subject stimuli response
## 1     402      NA         
## 2     402      NA         
## 3     402      NA         
## 4     402      NA         
## 5     402      NA         
## 6     402      NA
head(select(df_new, 1: 3))
##   subject stimuli response
## 1     402      NA         
## 2     402      NA         
## 3     402      NA         
## 4     402      NA         
## 5     402      NA         
## 6     402      NA
head(select(df_new, -subject))
##   stimuli response response_rt    block           ExperimentName
## 1      NA                   NA pracproc Assim_main_chinese_500ms
## 2      NA                   NA pracproc Assim_main_chinese_500ms
## 3      NA                   NA pracproc Assim_main_chinese_500ms
## 4      NA                   NA pracproc Assim_main_chinese_500ms
## 5      NA                   NA pracproc Assim_main_chinese_500ms
## 6      NA                   NA pracproc Assim_main_chinese_500ms
head(select(df_new, ExperimentName, everything()))
##             ExperimentName subject stimuli response response_rt    block
## 1 Assim_main_chinese_500ms     402      NA                   NA pracproc
## 2 Assim_main_chinese_500ms     402      NA                   NA pracproc
## 3 Assim_main_chinese_500ms     402      NA                   NA pracproc
## 4 Assim_main_chinese_500ms     402      NA                   NA pracproc
## 5 Assim_main_chinese_500ms     402      NA                   NA pracproc
## 6 Assim_main_chinese_500ms     402      NA                   NA pracproc
# change column names

head(rename(df_new, exp = ExperimentName))
##   subject stimuli response response_rt    block                      exp
## 1     402      NA                   NA pracproc Assim_main_chinese_500ms
## 2     402      NA                   NA pracproc Assim_main_chinese_500ms
## 3     402      NA                   NA pracproc Assim_main_chinese_500ms
## 4     402      NA                   NA pracproc Assim_main_chinese_500ms
## 5     402      NA                   NA pracproc Assim_main_chinese_500ms
## 6     402      NA                   NA pracproc Assim_main_chinese_500ms

6.2 Filtering observations

Not all the data in the raw dataset are useful. We can use filter( ) function to select rows or observations based on some criteria.

Three useful logical operators for forming conditions:

  • & means “and”

  • | means “or”

  • ! means “not”

# removing data from the practice block
df_new = filter(df_new, block != "pracproc")
head(df_new)
##   subject stimuli response response_rt  block           ExperimentName
## 1     402      33        f         672 block6 Assim_main_chinese_500ms
## 2     402     315        j        2831 block6 Assim_main_chinese_500ms
## 3     402      45                    0 block6 Assim_main_chinese_500ms
## 4     402      21                    0 block6 Assim_main_chinese_500ms
## 5     402      33        f        1041 block6 Assim_main_chinese_500ms
## 6     402     241                    0 block6 Assim_main_chinese_500ms
# removing missing data in response column
df_new = filter(df_new, !is.na(response))
df_new = filter(df_new, response != "")
head(df_new)
##   subject stimuli response response_rt  block           ExperimentName
## 1     402      33        f         672 block6 Assim_main_chinese_500ms
## 2     402     315        j        2831 block6 Assim_main_chinese_500ms
## 3     402      33        f        1041 block6 Assim_main_chinese_500ms
## 4     402     315        j         363 block6 Assim_main_chinese_500ms
## 5     402      21        f        1234 block6 Assim_main_chinese_500ms
## 6     402      45        j         322 block6 Assim_main_chinese_500ms
# pick up stimuli 33 and 21

head(filter(df_new, stimuli == 33 | stimuli == 21))
##   subject stimuli response response_rt  block           ExperimentName
## 1     402      33        f         672 block6 Assim_main_chinese_500ms
## 2     402      33        f        1041 block6 Assim_main_chinese_500ms
## 3     402      21        f        1234 block6 Assim_main_chinese_500ms
## 4     402      33        f         150 block6 Assim_main_chinese_500ms
## 5     402      33        f         206 block6 Assim_main_chinese_500ms
## 6     402      21        f          52 block6 Assim_main_chinese_500ms
head(filter(df_new, stimuli %in% c(33, 21)))
##   subject stimuli response response_rt  block           ExperimentName
## 1     402      33        f         672 block6 Assim_main_chinese_500ms
## 2     402      33        f        1041 block6 Assim_main_chinese_500ms
## 3     402      21        f        1234 block6 Assim_main_chinese_500ms
## 4     402      33        f         150 block6 Assim_main_chinese_500ms
## 5     402      33        f         206 block6 Assim_main_chinese_500ms
## 6     402      21        f          52 block6 Assim_main_chinese_500ms
# removing data with respones time shorter than 200

head(filter(df_new, response_rt < 200))
##   subject stimuli response response_rt  block           ExperimentName
## 1     402      45        j         109 block6 Assim_main_chinese_500ms
## 2     402      33        f         150 block6 Assim_main_chinese_500ms
## 3     402      21        f          52 block6 Assim_main_chinese_500ms
## 4     402      33        f         107 block5 Assim_main_chinese_500ms
## 5     402      33        f         153 block5 Assim_main_chinese_500ms
## 6     402      21        f          95 block2 Assim_main_chinese_500ms
head(filter(df_new, response_rt > 200))
##   subject stimuli response response_rt  block           ExperimentName
## 1     402      33        f         672 block6 Assim_main_chinese_500ms
## 2     402     315        j        2831 block6 Assim_main_chinese_500ms
## 3     402      33        f        1041 block6 Assim_main_chinese_500ms
## 4     402     315        j         363 block6 Assim_main_chinese_500ms
## 5     402      21        f        1234 block6 Assim_main_chinese_500ms
## 6     402      45        j         322 block6 Assim_main_chinese_500ms

6.3 Changing the order of rows

# arrange the dataframe by stimuli and response_rt columns
head(arrange(df_new, stimuli, response_rt))
##   subject stimuli response response_rt  block              ExperimentName
## 1     418      21        g           6 block7 assim_main_vietnamese_500ms
## 2     418      21        g          51 block4 assim_main_vietnamese_500ms
## 3     402      21        f          52 block6    Assim_main_chinese_500ms
## 4     418      21        g          85 block1 assim_main_vietnamese_500ms
## 5     402      21        f          95 block2    Assim_main_chinese_500ms
## 6     418      21        g          99 block7 assim_main_vietnamese_500ms
# arrange the dataframe by stimuli and the descending order of block columns
head(arrange(df_new, stimuli, desc(block)))
##   subject stimuli response response_rt  block              ExperimentName
## 1     402      21        f         609 block7    Assim_main_chinese_500ms
## 2     402      21        f         224 block7    Assim_main_chinese_500ms
## 3     402      21        f         286 block7    Assim_main_chinese_500ms
## 4     402      21        f        1136 block7    Assim_main_chinese_500ms
## 5     418      21        g           6 block7 assim_main_vietnamese_500ms
## 6     418      21        g         331 block7 assim_main_vietnamese_500ms

6.4 Generating new variables

We can generate new variables based on existing variables using the function mutate() and transmutate(). The difference is mutate will keep the original variables while transmutate will erase orginal variables.

# you can add, minus, log transform any numberic column
head(mutate(df_new, response_rt_new = log(response_rt)))
##   subject stimuli response response_rt  block           ExperimentName
## 1     402      33        f         672 block6 Assim_main_chinese_500ms
## 2     402     315        j        2831 block6 Assim_main_chinese_500ms
## 3     402      33        f        1041 block6 Assim_main_chinese_500ms
## 4     402     315        j         363 block6 Assim_main_chinese_500ms
## 5     402      21        f        1234 block6 Assim_main_chinese_500ms
## 6     402      45        j         322 block6 Assim_main_chinese_500ms
##   response_rt_new
## 1        6.510258
## 2        7.948385
## 3        6.947937
## 4        5.894403
## 5        7.118016
## 6        5.774552
# you can extract part of the information from a character variable

head(mutate(df_new, ISI = str_extract(ExperimentName, "2000|500")))
##   subject stimuli response response_rt  block           ExperimentName ISI
## 1     402      33        f         672 block6 Assim_main_chinese_500ms 500
## 2     402     315        j        2831 block6 Assim_main_chinese_500ms 500
## 3     402      33        f        1041 block6 Assim_main_chinese_500ms 500
## 4     402     315        j         363 block6 Assim_main_chinese_500ms 500
## 5     402      21        f        1234 block6 Assim_main_chinese_500ms 500
## 6     402      45        j         322 block6 Assim_main_chinese_500ms 500
head(transmute(df_new, ISI = str_extract(ExperimentName, "2000|500")))
##   ISI
## 1 500
## 2 500
## 3 500
## 4 500
## 5 500
## 6 500
# you can paste two variabls together

head(transmute(df_new, ISI = paste(stimuli, block, sep = "_")))
##          ISI
## 1  33_block6
## 2 315_block6
## 3  33_block6
## 4 315_block6
## 5  21_block6
## 6  45_block6
# you can recode a variable

head( mutate(df_new, block = recode(block, 
                         block1 = "ss", block2 = "ss",
                         block3 = "sd", block4 = "sd",
                         block5 = "ds", block6 = "ds",
                         block7 = "dd")))
##   subject stimuli response response_rt block           ExperimentName
## 1     402      33        f         672    ds Assim_main_chinese_500ms
## 2     402     315        j        2831    ds Assim_main_chinese_500ms
## 3     402      33        f        1041    ds Assim_main_chinese_500ms
## 4     402     315        j         363    ds Assim_main_chinese_500ms
## 5     402      21        f        1234    ds Assim_main_chinese_500ms
## 6     402      45        j         322    ds Assim_main_chinese_500ms

6.5 The pipeline “%>%”

You may have noticed the repetition in the above codes. We have to specify the dataframe every time we use a function and have to store it in a new name if we want to keep the original dataframe.

Using pipeline %>% can reduce the redundancy and the mistake we may make when we have a dozen of dataframes to refer to.

df_final = df %>%
  # selecting the columns we need
  select(., subject = "Subject",
                stimuli = "tone.Trial.",
                response = "insex1.RESP",
                response_rt = "insex1.RT",
                block = "Procedure.Block.",
                exp = ExperimentName)%>%
  #filtering out useless data
  filter(block != "pracproc" & !is.na(response) & response != "")%>%
  # generating new variables based on old variables
  mutate(ISI = str_extract(exp, "2000|500"),
         block = recode(block, 
                         block1 = "ss", block2 = "ss",
                         block3 = "sd", block4 = "sd",
                         block5 = "ds", block6 = "ds",
                         block7 = "dd"))

6.6 Summarizing

Summarize() and group_by () are often used together to give us some basic summary of the data.

df_final%>%
  group_by(subject)%>%
  summarize(count = n())
## # A tibble: 2 x 2
##   subject count
##     <int> <int>
## 1     402   221
## 2     418   139
df_final%>%
  group_by(subject, stimuli)%>%
  summarize(rt = mean(response_rt))
## # A tibble: 10 x 3
## # Groups:   subject [2]
##    subject stimuli    rt
##      <int>   <int> <dbl>
##  1     402      21  747.
##  2     402      33  688.
##  3     402      45  831.
##  4     402     241  747.
##  5     402     315  787.
##  6     418      21  578.
##  7     418      33  553.
##  8     418      45  574.
##  9     418     241  715.
## 10     418     315  694.
df_final%>%
  group_by(stimuli, response)%>%
  mutate(counter = 1)%>%
  summarize(count = sum(counter))%>%
  spread(stimuli, value = count)
## # A tibble: 5 x 6
##   response  `21`  `33`  `45` `241` `315`
##   <fct>    <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 f           24    21     1    12    NA
## 2 j           NA    NA    42     1    15
## 3 d            1    11    NA    49    NA
## 4 g           54    45    NA     6    NA
## 5 h           NA    NA    23    NA    55
df_final%>%
  group_by(stimuli,response)%>%
  mutate(counter = 1)%>%
  summarize(counter = sum(counter))%>%
  mutate( percentage = counter/sum(counter),
          sum = sum(counter))
## # A tibble: 15 x 5
## # Groups:   stimuli [5]
##    stimuli response counter percentage   sum
##      <int> <fct>      <dbl>      <dbl> <dbl>
##  1      21 f             24     0.304     79
##  2      21 d              1     0.0127    79
##  3      21 g             54     0.684     79
##  4      33 f             21     0.273     77
##  5      33 d             11     0.143     77
##  6      33 g             45     0.584     77
##  7      45 f              1     0.0152    66
##  8      45 j             42     0.636     66
##  9      45 h             23     0.348     66
## 10     241 f             12     0.176     68
## 11     241 j              1     0.0147    68
## 12     241 d             49     0.721     68
## 13     241 g              6     0.0882    68
## 14     315 j             15     0.214     70
## 15     315 h             55     0.786     70
table.final = df_final%>%
  group_by(stimuli,response)%>%
  mutate(counter = 1)%>%
  summarize(counter = sum(counter))%>%
  mutate( percentage = round(counter/sum(counter),2),
          sum = sum(counter))%>%
  select(stimuli,percentage, response)%>%
  spread(stimuli, value = percentage)

library(knitr)
kable(table.final, caption = "Percentage of choice")
Table 6.1: Percentage of choice
response 21 33 45 241 315
f 0.30 0.27 0.02 0.18 NA
j NA NA 0.64 0.01 0.21
d 0.01 0.14 NA 0.72 NA
g 0.68 0.58 NA 0.09 NA
h NA NA 0.35 NA 0.79