Chapter 6 Tranform your data
6.1 Selecting variables of interest
When we get a dataset, one of the most important things to do is to pick the variables we need, especially when the raw dataset has too many columns.
library(tidyverse)
# getting the data
filename = list.files(paste0(getwd(), "/data"),full.names=T, pattern = ".csv") # get only csv files
df = data.frame()
bin = data.frame()
for (i in 1:length(filename)){
# length(filename) = how many files
bin = read.csv(filename[i])
df = rbind(df, bin)
}
#head(df)
#colnames(df)
In this dataset df, we need to get the ID of participants as stored in Subject, the stimuli they heard in “tone.Trial”, their response in “insex1.RESP”, their response time in “insex1.RT”, and the experiment conditions in “Procedure.Block.”.
df_new = select(df, subject = "Subject",
stimuli = "tone.Trial.",
response = "insex1.RESP",
response_rt = "insex1.RT",
block = "Procedure.Block.",
ExperimentName)
head(df_new)
## subject stimuli response response_rt block ExperimentName
## 1 402 NA NA pracproc Assim_main_chinese_500ms
## 2 402 NA NA pracproc Assim_main_chinese_500ms
## 3 402 NA NA pracproc Assim_main_chinese_500ms
## 4 402 NA NA pracproc Assim_main_chinese_500ms
## 5 402 NA NA pracproc Assim_main_chinese_500ms
## 6 402 NA NA pracproc Assim_main_chinese_500ms
# select by names
head(select(df_new, subject: response))
## subject stimuli response
## 1 402 NA
## 2 402 NA
## 3 402 NA
## 4 402 NA
## 5 402 NA
## 6 402 NA
head(select(df_new, subject: response))
## subject stimuli response
## 1 402 NA
## 2 402 NA
## 3 402 NA
## 4 402 NA
## 5 402 NA
## 6 402 NA
head(select(df_new, 1: 3))
## subject stimuli response
## 1 402 NA
## 2 402 NA
## 3 402 NA
## 4 402 NA
## 5 402 NA
## 6 402 NA
head(select(df_new, -subject))
## stimuli response response_rt block ExperimentName
## 1 NA NA pracproc Assim_main_chinese_500ms
## 2 NA NA pracproc Assim_main_chinese_500ms
## 3 NA NA pracproc Assim_main_chinese_500ms
## 4 NA NA pracproc Assim_main_chinese_500ms
## 5 NA NA pracproc Assim_main_chinese_500ms
## 6 NA NA pracproc Assim_main_chinese_500ms
head(select(df_new, ExperimentName, everything()))
## ExperimentName subject stimuli response response_rt block
## 1 Assim_main_chinese_500ms 402 NA NA pracproc
## 2 Assim_main_chinese_500ms 402 NA NA pracproc
## 3 Assim_main_chinese_500ms 402 NA NA pracproc
## 4 Assim_main_chinese_500ms 402 NA NA pracproc
## 5 Assim_main_chinese_500ms 402 NA NA pracproc
## 6 Assim_main_chinese_500ms 402 NA NA pracproc
# change column names
head(rename(df_new, exp = ExperimentName))
## subject stimuli response response_rt block exp
## 1 402 NA NA pracproc Assim_main_chinese_500ms
## 2 402 NA NA pracproc Assim_main_chinese_500ms
## 3 402 NA NA pracproc Assim_main_chinese_500ms
## 4 402 NA NA pracproc Assim_main_chinese_500ms
## 5 402 NA NA pracproc Assim_main_chinese_500ms
## 6 402 NA NA pracproc Assim_main_chinese_500ms
6.2 Filtering observations
Not all the data in the raw dataset are useful. We can use filter( ) function to select rows or observations based on some criteria.
Three useful logical operators for forming conditions:
& means “and”
| means “or”
! means “not”
# removing data from the practice block
df_new = filter(df_new, block != "pracproc")
head(df_new)
## subject stimuli response response_rt block ExperimentName
## 1 402 33 f 672 block6 Assim_main_chinese_500ms
## 2 402 315 j 2831 block6 Assim_main_chinese_500ms
## 3 402 45 0 block6 Assim_main_chinese_500ms
## 4 402 21 0 block6 Assim_main_chinese_500ms
## 5 402 33 f 1041 block6 Assim_main_chinese_500ms
## 6 402 241 0 block6 Assim_main_chinese_500ms
# removing missing data in response column
df_new = filter(df_new, !is.na(response))
df_new = filter(df_new, response != "")
head(df_new)
## subject stimuli response response_rt block ExperimentName
## 1 402 33 f 672 block6 Assim_main_chinese_500ms
## 2 402 315 j 2831 block6 Assim_main_chinese_500ms
## 3 402 33 f 1041 block6 Assim_main_chinese_500ms
## 4 402 315 j 363 block6 Assim_main_chinese_500ms
## 5 402 21 f 1234 block6 Assim_main_chinese_500ms
## 6 402 45 j 322 block6 Assim_main_chinese_500ms
# pick up stimuli 33 and 21
head(filter(df_new, stimuli == 33 | stimuli == 21))
## subject stimuli response response_rt block ExperimentName
## 1 402 33 f 672 block6 Assim_main_chinese_500ms
## 2 402 33 f 1041 block6 Assim_main_chinese_500ms
## 3 402 21 f 1234 block6 Assim_main_chinese_500ms
## 4 402 33 f 150 block6 Assim_main_chinese_500ms
## 5 402 33 f 206 block6 Assim_main_chinese_500ms
## 6 402 21 f 52 block6 Assim_main_chinese_500ms
head(filter(df_new, stimuli %in% c(33, 21)))
## subject stimuli response response_rt block ExperimentName
## 1 402 33 f 672 block6 Assim_main_chinese_500ms
## 2 402 33 f 1041 block6 Assim_main_chinese_500ms
## 3 402 21 f 1234 block6 Assim_main_chinese_500ms
## 4 402 33 f 150 block6 Assim_main_chinese_500ms
## 5 402 33 f 206 block6 Assim_main_chinese_500ms
## 6 402 21 f 52 block6 Assim_main_chinese_500ms
# removing data with respones time shorter than 200
head(filter(df_new, response_rt < 200))
## subject stimuli response response_rt block ExperimentName
## 1 402 45 j 109 block6 Assim_main_chinese_500ms
## 2 402 33 f 150 block6 Assim_main_chinese_500ms
## 3 402 21 f 52 block6 Assim_main_chinese_500ms
## 4 402 33 f 107 block5 Assim_main_chinese_500ms
## 5 402 33 f 153 block5 Assim_main_chinese_500ms
## 6 402 21 f 95 block2 Assim_main_chinese_500ms
head(filter(df_new, response_rt > 200))
## subject stimuli response response_rt block ExperimentName
## 1 402 33 f 672 block6 Assim_main_chinese_500ms
## 2 402 315 j 2831 block6 Assim_main_chinese_500ms
## 3 402 33 f 1041 block6 Assim_main_chinese_500ms
## 4 402 315 j 363 block6 Assim_main_chinese_500ms
## 5 402 21 f 1234 block6 Assim_main_chinese_500ms
## 6 402 45 j 322 block6 Assim_main_chinese_500ms
6.3 Changing the order of rows
# arrange the dataframe by stimuli and response_rt columns
head(arrange(df_new, stimuli, response_rt))
## subject stimuli response response_rt block ExperimentName
## 1 418 21 g 6 block7 assim_main_vietnamese_500ms
## 2 418 21 g 51 block4 assim_main_vietnamese_500ms
## 3 402 21 f 52 block6 Assim_main_chinese_500ms
## 4 418 21 g 85 block1 assim_main_vietnamese_500ms
## 5 402 21 f 95 block2 Assim_main_chinese_500ms
## 6 418 21 g 99 block7 assim_main_vietnamese_500ms
# arrange the dataframe by stimuli and the descending order of block columns
head(arrange(df_new, stimuli, desc(block)))
## subject stimuli response response_rt block ExperimentName
## 1 402 21 f 609 block7 Assim_main_chinese_500ms
## 2 402 21 f 224 block7 Assim_main_chinese_500ms
## 3 402 21 f 286 block7 Assim_main_chinese_500ms
## 4 402 21 f 1136 block7 Assim_main_chinese_500ms
## 5 418 21 g 6 block7 assim_main_vietnamese_500ms
## 6 418 21 g 331 block7 assim_main_vietnamese_500ms
6.4 Generating new variables
We can generate new variables based on existing variables using the function mutate() and transmutate(). The difference is mutate will keep the original variables while transmutate will erase orginal variables.
# you can add, minus, log transform any numberic column
head(mutate(df_new, response_rt_new = log(response_rt)))
## subject stimuli response response_rt block ExperimentName
## 1 402 33 f 672 block6 Assim_main_chinese_500ms
## 2 402 315 j 2831 block6 Assim_main_chinese_500ms
## 3 402 33 f 1041 block6 Assim_main_chinese_500ms
## 4 402 315 j 363 block6 Assim_main_chinese_500ms
## 5 402 21 f 1234 block6 Assim_main_chinese_500ms
## 6 402 45 j 322 block6 Assim_main_chinese_500ms
## response_rt_new
## 1 6.510258
## 2 7.948385
## 3 6.947937
## 4 5.894403
## 5 7.118016
## 6 5.774552
# you can extract part of the information from a character variable
head(mutate(df_new, ISI = str_extract(ExperimentName, "2000|500")))
## subject stimuli response response_rt block ExperimentName ISI
## 1 402 33 f 672 block6 Assim_main_chinese_500ms 500
## 2 402 315 j 2831 block6 Assim_main_chinese_500ms 500
## 3 402 33 f 1041 block6 Assim_main_chinese_500ms 500
## 4 402 315 j 363 block6 Assim_main_chinese_500ms 500
## 5 402 21 f 1234 block6 Assim_main_chinese_500ms 500
## 6 402 45 j 322 block6 Assim_main_chinese_500ms 500
head(transmute(df_new, ISI = str_extract(ExperimentName, "2000|500")))
## ISI
## 1 500
## 2 500
## 3 500
## 4 500
## 5 500
## 6 500
# you can paste two variabls together
head(transmute(df_new, ISI = paste(stimuli, block, sep = "_")))
## ISI
## 1 33_block6
## 2 315_block6
## 3 33_block6
## 4 315_block6
## 5 21_block6
## 6 45_block6
# you can recode a variable
head( mutate(df_new, block = recode(block,
block1 = "ss", block2 = "ss",
block3 = "sd", block4 = "sd",
block5 = "ds", block6 = "ds",
block7 = "dd")))
## subject stimuli response response_rt block ExperimentName
## 1 402 33 f 672 ds Assim_main_chinese_500ms
## 2 402 315 j 2831 ds Assim_main_chinese_500ms
## 3 402 33 f 1041 ds Assim_main_chinese_500ms
## 4 402 315 j 363 ds Assim_main_chinese_500ms
## 5 402 21 f 1234 ds Assim_main_chinese_500ms
## 6 402 45 j 322 ds Assim_main_chinese_500ms
6.5 The pipeline “%>%”
You may have noticed the repetition in the above codes. We have to specify the dataframe every time we use a function and have to store it in a new name if we want to keep the original dataframe.
Using pipeline %>% can reduce the redundancy and the mistake we may make when we have a dozen of dataframes to refer to.
df_final = df %>%
# selecting the columns we need
select(., subject = "Subject",
stimuli = "tone.Trial.",
response = "insex1.RESP",
response_rt = "insex1.RT",
block = "Procedure.Block.",
exp = ExperimentName)%>%
#filtering out useless data
filter(block != "pracproc" & !is.na(response) & response != "")%>%
# generating new variables based on old variables
mutate(ISI = str_extract(exp, "2000|500"),
block = recode(block,
block1 = "ss", block2 = "ss",
block3 = "sd", block4 = "sd",
block5 = "ds", block6 = "ds",
block7 = "dd"))
6.6 Summarizing
Summarize() and group_by () are often used together to give us some basic summary of the data.
df_final%>%
group_by(subject)%>%
summarize(count = n())
## # A tibble: 2 x 2
## subject count
## <int> <int>
## 1 402 221
## 2 418 139
df_final%>%
group_by(subject, stimuli)%>%
summarize(rt = mean(response_rt))
## # A tibble: 10 x 3
## # Groups: subject [2]
## subject stimuli rt
## <int> <int> <dbl>
## 1 402 21 747.
## 2 402 33 688.
## 3 402 45 831.
## 4 402 241 747.
## 5 402 315 787.
## 6 418 21 578.
## 7 418 33 553.
## 8 418 45 574.
## 9 418 241 715.
## 10 418 315 694.
df_final%>%
group_by(stimuli, response)%>%
mutate(counter = 1)%>%
summarize(count = sum(counter))%>%
spread(stimuli, value = count)
## # A tibble: 5 x 6
## response `21` `33` `45` `241` `315`
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 f 24 21 1 12 NA
## 2 j NA NA 42 1 15
## 3 d 1 11 NA 49 NA
## 4 g 54 45 NA 6 NA
## 5 h NA NA 23 NA 55
df_final%>%
group_by(stimuli,response)%>%
mutate(counter = 1)%>%
summarize(counter = sum(counter))%>%
mutate( percentage = counter/sum(counter),
sum = sum(counter))
## # A tibble: 15 x 5
## # Groups: stimuli [5]
## stimuli response counter percentage sum
## <int> <fct> <dbl> <dbl> <dbl>
## 1 21 f 24 0.304 79
## 2 21 d 1 0.0127 79
## 3 21 g 54 0.684 79
## 4 33 f 21 0.273 77
## 5 33 d 11 0.143 77
## 6 33 g 45 0.584 77
## 7 45 f 1 0.0152 66
## 8 45 j 42 0.636 66
## 9 45 h 23 0.348 66
## 10 241 f 12 0.176 68
## 11 241 j 1 0.0147 68
## 12 241 d 49 0.721 68
## 13 241 g 6 0.0882 68
## 14 315 j 15 0.214 70
## 15 315 h 55 0.786 70
table.final = df_final%>%
group_by(stimuli,response)%>%
mutate(counter = 1)%>%
summarize(counter = sum(counter))%>%
mutate( percentage = round(counter/sum(counter),2),
sum = sum(counter))%>%
select(stimuli,percentage, response)%>%
spread(stimuli, value = percentage)
library(knitr)
kable(table.final, caption = "Percentage of choice")
response | 21 | 33 | 45 | 241 | 315 |
---|---|---|---|---|---|
f | 0.30 | 0.27 | 0.02 | 0.18 | NA |
j | NA | NA | 0.64 | 0.01 | 0.21 |
d | 0.01 | 0.14 | NA | 0.72 | NA |
g | 0.68 | 0.58 | NA | 0.09 | NA |
h | NA | NA | 0.35 | NA | 0.79 |