# This script was made by Joshua Balsters at Gorilla to analyse data from the
# Language Gorilla Academy course. 28/10/2020

library(dplyr)
library(stringr)
library(stringdist)
library(ggplot2)
library(lme4)
library(lmtest)

# Directory holding the exported Gorilla data. The output CSV is written here
# as well. Paths are built with file.path() rather than calling setwd(), so
# running the script does not change the session's working directory, and the
# global environment is no longer wiped with rm(list = ls()).
data_dir <- "~/Dropbox (Cauldron)/Cauldron Team Folder/Sales and Marketing/Research Methods Course/Materials/Language/analysis_code/data_exp_25898-v5"

################# helpers #################

# Convert a 0/1 or FALSE/TRUE accuracy vector into a factor with the levels
# "Incorrect" and "Correct". The levels are named explicitly, so the labels
# stay right even when only one outcome occurs in the data (relabelling with
# `levels(x) <- ...` assigns labels by position and would mislabel that case).
as_accuracy_factor <- function(x) {
  if (!is.logical(x)) {
    # Go through character so a factor column yields its values, not its codes.
    x <- as.numeric(as.character(x))
  }
  factor(as.integer(x), levels = c(0L, 1L), labels = c("Incorrect", "Correct"))
}

# Mean accuracy and standard error of the mean for every Condition x Type cell.
#   df      - data frame with Condition, Type and the outcome column
#   outcome - name (string) of an Incorrect/Correct factor column in `df`
# Returns one row per cell with the columns Condition, Type, mean and SEM.
summarise_accuracy <- function(df, outcome) {
  df$score <- as.numeric(df[[outcome]] == "Correct")
  df %>%
    group_by(Condition, Type) %>%
    summarise(
      mean = mean(score, na.rm = TRUE),
      # Divide by the number of non-missing scores so the denominator matches
      # the values that sd() actually used.
      SEM = sd(score, na.rm = TRUE) / sqrt(sum(!is.na(score)))
    ) %>%
    ungroup()
}

# Rough bar plot (mean +/- SEM) of the output of summarise_accuracy().
#   summary_df - data frame returned by summarise_accuracy()
#   title      - plot title
# The x labels are built from each row's own Condition and Type, so a bar can
# never be paired with the wrong label.
plot_accuracy <- function(summary_df, title) {
  summary_df$cell <- paste(summary_df$Condition, summary_df$Type, sep = "_")
  # Alternate the two colours over the rows (rows are ordered by Condition,
  # then Type).
  bar_colors <- rep(c("orange", "dodgerblue3"), length.out = nrow(summary_df))
  ggplot(summary_df, aes(x = cell)) +
    geom_bar(aes(y = mean), alpha = 0.6, stat = "identity", fill = bar_colors) +
    geom_linerange(aes(ymin = mean - SEM, ymax = mean + SEM), size = 2) +
    ylim(0, 1) +
    xlab("Condition_Type") +
    ggtitle(title)
}

# Score the rows whose ANSWER equals `target`, allowing one typing mistake.
#   df     - data frame with the columns ANSWER and Response2
#   target - the answer word to score against
# Returns those rows with an added logical column Correct3.
score_with_tolerance <- function(target, df) {
  rows <- filter(df, ANSWER == target)
  # Here I used dl algorithm but you may want to try others.
  # NOTE(review): grabl() is stringdist's approximate grepl(); it presumably
  # also accepts longer responses that merely contain a near-match of the
  # answer. `stringdist(Response2, target, method = "dl") <= 1` would compare
  # whole strings instead - confirm which is intended.
  rows$Correct3 <- grabl(rows$Response2, target, method = "dl", maxDist = 1)
  rows
}

################# step 1 - analysis of raw data #################

# load data files
data <- read.csv(
  file.path(data_dir, "data_exp_25898-v5_task-54o6.csv"),
  header = TRUE,
  sep = ","
)

# select relevant fields and filter to only include responses
data <- data %>%
  select(
    Participant.Private.ID, Zone.Type, Reaction.Time, Response, ANSWER,
    Correct, Type, Condition, Noise, Video
  ) %>%
  filter(Zone.Type == "response_text_entry") %>%
  rename(ID = Participant.Private.ID) # rename Participant.Private.ID to ID

# Strip the file-name suffixes from the video names. fixed() makes each suffix
# a literal string; as a regular expression the "." would match any character.
video_suffixes <- c("_-12.mp4", "_-12a.mp4", "_a.mp4", ".mp4")
for (suffix in video_suffixes) {
  data$Video <- str_replace_all(data$Video, fixed(suffix), "")
}

# What does the data look like? Notice that it thinks RTs are character
# strings... we need to change the data types
str(data)

data <- data %>%
  mutate(
    ID = as.factor(ID), # make a factor
    # make a number; as.character() first so that a factor column is converted
    # by value rather than by level code
    Reaction.Time = as.numeric(as.character(Reaction.Time)),
    Type = as.factor(Type), # make a factor
    Condition = as.factor(Condition),
    Noise = as.factor(Noise),
    Video = as.factor(Video),
    Correct = as_accuracy_factor(Correct) # label as Incorrect / Correct
  )

# First analysis using Generalised linear mixed effect model on original data
# Type and Condition are fixed effects, participants and words are random
# effects
data_filter <- filter(data, Noise == "Y") # only include noisy trials

# run Generalised linear mixed effect model
gm_orig <- glmer(
  Correct ~ Type * Condition + (1 | ID) + (1 | Video),
  family = binomial,
  data = data_filter
)

# show results
print(summary(gm_orig))

# The results show a very significant effect of Type (AV more accurate than A)
# but let's plot the data
# first we need to calculate mean and SEM
mean_plot <- summarise_accuracy(data_filter, "Correct")

# rough bar plot of data using GGplot; print() so the plot is also drawn when
# the script is run with source()
print(plot_accuracy(mean_plot, "Raw data"))

################# step 2 - start processing text data #################

# convert all responses to lower case
data$Response2 <- str_to_lower(data$Response)

# keep only alphanumeric characters
data$Response2 <- str_replace_all(data$Response2, "[^[:alnum:]]", "")

# compare responses to answers and create new field called Correct2
data$Correct2 <- as_accuracy_factor(data$Response2 == data$ANSWER)

# filter data to only include Noise trials
data_filter <- filter(data, Noise == "Y")

# Model Type and Condition as fixed effects, participants and words as random
# effects
gm1 <- glmer(
  Correct2 ~ Type * Condition + (1 | ID) + (1 | Video),
  family = binomial,
  data = data_filter
)

# show results
print(summary(gm1))

# The results show a very significant effect of Type (AV more accurate than A)
mean_plot <- summarise_accuracy(data_filter, "Correct2")

# rough bar plot of data
print(plot_accuracy(mean_plot, "Basic correction"))

################# step 3 - run string distance calculation to allow for 1 mistake in text entry #################

# create variable patterns for each of the answers
patterns <- c(
  "knock", "mature", "module", "moth", "chief", "hash", "those", "vacuum"
)

# create a matrix of 0's 8820 rows for the data and 8 columns for the answers
# NOTE this approach led to a bug, see bonus footage of video for discussion
# x=matrix(rep(0,8820*8), nrow=8820)
# for (i in 1:8) {
#   x[i]<-grabl(data_stringdist$Response2,pattern[i], method = "dl", maxDist=1)
# }

# compare the string distance of the responses to each of the answers and
# stack the per-answer results (rows whose ANSWER is not in `patterns` are
# not carried over)
data_total <- bind_rows(lapply(patterns, score_with_tolerance, df = data))

# turn the match result into a single Incorrect/Correct factor
data_total$Correct3 <- as_accuracy_factor(data_total$Correct3)

# filter data to only include Noise trials
data_filter <- filter(data_total, Noise == "Y")

# Model Type and Condition as fixed effects, participants and words as random
# effects
gm2 <- glmer(
  Correct3 ~ Type * Condition + (1 | ID) + (1 | Video),
  family = binomial,
  data = data_filter
)

# show results
print(summary(gm2))

# summarise the results and plot the data
mean_plot <- summarise_accuracy(data_filter, "Correct3")

# rough bar plot of data
print(plot_accuracy(mean_plot, "Advanced correction"))

# NOTE(review): gm_orig, gm1 and gm2 are fitted to different response
# variables (Correct, Correct2, Correct3), so these are not comparisons of
# nested models of the same response - interpret the output with caution.
print(anova(gm_orig, gm1))
print(anova(gm1, gm2))

################# step 4 - remove bad subjects #################

# As in Karas et al filter data for AV trials without noise and calculate
# accuracy (scored with the exact-match field Correct2)
check <- data_total %>%
  filter(Noise == "N", Type == "AV") %>%
  group_by(ID) %>%
  summarise(mean_corr = mean(Correct2 == "Correct", na.rm = TRUE)) %>%
  mutate(good_subj = mean_corr > 0.75) # set a limit of accuracy greater than 75%

# make a new variable with IDs of subjects with >75% accuracy; which() skips
# subjects whose accuracy could not be computed instead of returning NA
good_subj <- check$ID[which(check$good_subj)]

# only keep good subjects
data2_filter <- data_filter[data_filter$ID %in% good_subj, , drop = FALSE]

# write out CSV file
write.csv(data2_filter, file = file.path(data_dir, "all_data_28102020.csv"))

# run analysis again on new data
gm2a <- glmer(
  Correct3 ~ Type * Condition + (1 | ID) + (1 | Video),
  family = binomial,
  data = data2_filter # change data here
)
print(summary(gm2a))

# The results show a very significant effect of Type (AV more accurate than A)
mean_plot <- summarise_accuracy(data2_filter, "Correct3")

# rough bar plot of data
print(plot_accuracy(mean_plot, "Advanced subj remove correction"))