# This script was made by Joshua Balsters at Gorilla to analyse data from the Language Gorilla Academy course. 28/10/2020

library(dplyr)
library(stringr)
library (stringdist)
library(ggplot2)
library(lme4)
library(lmtest)

# Start from an empty workspace so that objects left over from an earlier
# session cannot leak into this analysis (loaded packages stay attached)
rm(list = ls())

# Point R at the folder that holds the data export; the file names used
# further down are resolved relative to this directory
setwd("~/Dropbox (Cauldron)/Cauldron Team Folder/Sales and Marketing/Research Methods Course/Materials/Language/analysis_code/data_exp_25898-v5")


################# step 1 - analysis of raw data  #################

# load the task export
data <- read.csv("data_exp_25898-v5_task-54o6.csv", header = TRUE, sep = ",")

# keep only the columns used below and only the rows holding typed responses
data <- data %>%
  select(Participant.Private.ID, Zone.Type, Reaction.Time, Response, ANSWER,
         Correct, Type, Condition, Noise, Video) %>%
  filter(Zone.Type == "response_text_entry")

# Strip the file-name suffixes so that Video only holds the stimulus name.
# fixed() makes stringr match each suffix literally; as a regular expression
# the "." would match any character. The bare ".mp4" has to come last because
# it is itself the tail of the other suffixes.
video_suffixes <- c("_-12.mp4", "_-12a.mp4", "_a.mp4", ".mp4")
for (suffix in video_suffixes) {
  data$Video <- str_replace_all(data$Video, fixed(suffix), "")
}

# What does the data look like? Notice that it thinks RTs are character
# strings... we need to change the data types
str(data)

data$Participant.Private.ID <- as.factor(data$Participant.Private.ID)
# Go through character first: if the column was read as a factor,
# as.numeric() on its own would return the level codes, not the times
data$Reaction.Time <- as.numeric(as.character(data$Reaction.Time))
data$Type <- as.factor(data$Type)
data$Condition <- as.factor(data$Condition)
data$Noise <- as.factor(data$Noise)
data$Video <- as.factor(data$Video)
# Label the outcome explicitly instead of relabelling levels by position, so
# the labels stay right even if only one outcome occurs in the data.
# NOTE(review): assumes Correct is coded 0/1 in the export - confirm.
data$Correct <- factor(data$Correct,
                       levels = c(0, 1),
                       labels = c("Incorrect", "Correct"))

# rename Participant.Private.ID to ID (by name rather than by position)
names(data)[names(data) == "Participant.Private.ID"] <- "ID"


# First analysis using a generalised linear mixed effect model on the
# original scoring
# Type and Condition are fixed effects, participants and words are random effects

data_filter <- filter(data, Noise == "Y") # only include noisy trials

gm_orig <- glmer(Correct ~ Type * Condition + (1 | ID) + (1 | Video),
                 family = binomial,
                 data = data_filter) # run generalised linear mixed effect model
# show results
print(summary(gm_orig))

# The results show a very significant effect of Type (AV more accurate than A) but let's plot the data
# first we need to calculate mean and SEM for every Condition x Type cell
mean_plot <- data_filter %>%
  group_by(Condition, Type) %>%
  mutate(Correct_plot = as.numeric(Correct == "Correct")) %>% # 1 = correct, 0 = incorrect
  summarise(
    mean = mean(Correct_plot, na.rm = TRUE),
    # divide by the number of scored trials so that the denominator matches
    # the values that actually entered sd()
    SEM = sd(Correct_plot, na.rm = TRUE) / sqrt(sum(!is.na(Correct_plot)))
  ) %>%
  ungroup() %>%
  # build the axis label from the data instead of hard-coding it, so bars
  # cannot end up under the wrong label
  mutate(Group = paste(Condition, Type, sep = "_"))

# rough bar plot of data using ggplot; one fill colour per level of Type
colors <- c("orange", "dodgerblue3")

# print() so the figure is also drawn when the script is run via source()
print(
  ggplot(mean_plot, aes(x = Group)) +
    geom_bar(aes(y = mean, fill = Type), alpha = 0.6, stat = "identity",
             show.legend = FALSE) +
    geom_linerange(aes(ymin = mean - SEM, ymax = mean + SEM), size = 2) +
    scale_fill_manual(values = colors) +
    ylim(0, 1) + ggtitle("Raw data")
)

################# step 2 - start processing text data #################

# convert all responses to lower case
data$Response2 <- str_to_lower(data$Response)
# keep only alphanumeric characters (spaces and punctuation are removed too)
data$Response2 <- str_replace_all(data$Response2, "[^[:alnum:]]", "")

# compare cleaned responses to answers and create a new field called Correct2.
# The levels are named explicitly so the labels stay right even if every
# response happens to be correct (or incorrect).
data$Correct2 <- factor(data$Response2 == data$ANSWER,
                        levels = c(FALSE, TRUE),
                        labels = c("Incorrect", "Correct"))

# filter data to only include Noise trials
data_filter <- filter(data, Noise == "Y")

# Model Type and Condition as fixed effects, participants and words as random effects
gm1 <- glmer(Correct2 ~ Type * Condition + (1 | ID) + (1 | Video),
             family = binomial,
             data = data_filter)
# show results
print(summary(gm1))

# The results show a very significant effect of Type (AV more accurate than A)
# mean and SEM for every Condition x Type cell
mean_plot <- data_filter %>%
  group_by(Condition, Type) %>%
  mutate(Correct_plot = as.numeric(Correct2 == "Correct")) %>% # 1 = correct, 0 = incorrect
  summarise(
    mean = mean(Correct_plot, na.rm = TRUE),
    # divide by the number of scored trials so that the denominator matches
    # the values that actually entered sd()
    SEM = sd(Correct_plot, na.rm = TRUE) / sqrt(sum(!is.na(Correct_plot)))
  ) %>%
  ungroup() %>%
  # build the axis label from the data instead of hard-coding it
  mutate(Group = paste(Condition, Type, sep = "_"))

# rough bar plot of data; one fill colour per level of Type
colors <- c("orange", "dodgerblue3")

# print() so the figure is also drawn when the script is run via source()
print(
  ggplot(mean_plot, aes(x = Group)) +
    geom_bar(aes(y = mean, fill = Type), alpha = 0.6, stat = "identity",
             show.legend = FALSE) +
    geom_linerange(aes(ymin = mean - SEM, ymax = mean + SEM), size = 2) +
    scale_fill_manual(values = colors) +
    ylim(0, 1) + ggtitle("Basic correction")
)


################# step 3  - run string distance calculation to allow for 1 mistake in text entry #################
# the target words; every response is compared only against the answer of
# its own trial
patterns <- c("knock", "mature", "module", "moth",
              "chief", "hash", "those", "vacuum")

# create a matrix of 0's 8820 rows for the data and 8 columns for the answers NOTE this approach led to a bug see bonus footage of video for discussion
#x=matrix(rep(0,8820*8), nrow=8820)

#for (i in 1:8) {
#  x[i]<-grabl(data_stringdist$Response2,pattern[i], method = "dl", maxDist=1)
#}

# Trials whose ANSWER is not listed in `patterns` are not picked up by any
# subset below; say so instead of dropping them silently
unmatched <- !(data$ANSWER %in% patterns)
if (any(unmatched)) {
  warning(sum(unmatched),
          " row(s) have an ANSWER that is not in `patterns` and are left out of data_total",
          call. = FALSE)
}

# For each target word, take its trials and flag responses that are within one
# edit of the target. Here I used the dl algorithm but you may want to try others.
# NOTE(review): grabl() is the approximate counterpart of grepl(), so it looks
# for the target inside the response rather than comparing whole strings.
scored_by_word <- lapply(patterns, function(target_word) {
  word_trials <- filter(data, ANSWER == target_word)
  word_trials$Correct3 <- grabl(word_trials$Response2, target_word,
                                method = "dl", maxDist = 1)
  word_trials
})
# stack the per-word subsets once, in the order of `patterns`
data_total <- do.call(rbind, scored_by_word)

# turn the TRUE/FALSE flag into a labelled factor; levels are named explicitly
# so the labels cannot be swapped when only one outcome occurs
data_total$Correct3 <- factor(data_total$Correct3,
                              levels = c(FALSE, TRUE),
                              labels = c("Incorrect", "Correct"))

# filter data to only include Noise trials
data_filter <- filter(data_total, Noise == "Y")


# Model Type and Condition as fixed effects, participants and words as random effects
gm2 <- glmer(Correct3 ~ Type * Condition + (1 | ID) + (1 | Video),
             family = binomial,
             data = data_filter)
# show results
print(summary(gm2))

# summarise the results and plot the data
mean_plot <- data_filter %>%
  group_by(Condition, Type) %>%
  mutate(Correct_plot = as.numeric(Correct3 == "Correct")) %>% # 1 = correct, 0 = incorrect
  summarise(
    mean = mean(Correct_plot, na.rm = TRUE),
    # divide by the number of scored trials so that the denominator matches
    # the values that actually entered sd()
    SEM = sd(Correct_plot, na.rm = TRUE) / sqrt(sum(!is.na(Correct_plot)))
  ) %>%
  ungroup() %>%
  # build the axis label from the data instead of hard-coding it
  mutate(Group = paste(Condition, Type, sep = "_"))

# rough bar plot of data; one fill colour per level of Type
colors <- c("orange", "dodgerblue3")

# print() so the figure is also drawn when the script is run via source()
print(
  ggplot(mean_plot, aes(x = Group)) +
    geom_bar(aes(y = mean, fill = Type), alpha = 0.6, stat = "identity",
             show.legend = FALSE) +
    geom_linerange(aes(ymin = mean - SEM, ymax = mean + SEM), size = 2) +
    scale_fill_manual(values = colors) +
    ylim(0, 1) + ggtitle("Advanced correction")
)

# compare the fits of the three scoring schemes; printed explicitly so the
# tables also appear when the script is run via source()
# NOTE(review): the three models have different response variables (Correct,
# Correct2, Correct3), so these are not comparisons of nested models on the
# same outcome - interpret the tables with care.
print(anova(gm_orig, gm1))
print(anova(gm1, gm2))

################# step 4 - remove bad subjects #################

# As in Karas et al: take the AV trials without noise and calculate each
# participant's accuracy on them (working on a copy, data_total is untouched)
# NOTE(review): accuracy here is based on the exact-match score (Correct2),
# while the models below use the string-distance score (Correct3) - confirm
# that this is intended.
check <- data_total %>%
  filter(Noise == "N" & Type == "AV") %>%
  group_by(ID) %>%
  summarise(mean_corr = mean(Correct2 == "Correct", na.rm = TRUE)) %>%
  mutate(good_subj = mean_corr > 0.75) # set a limit of accuracy greater than 75%

# IDs of subjects with >75% accuracy; which() skips participants whose
# accuracy could not be computed instead of adding an NA to the list
good_subj <- check$ID[which(check$good_subj)]

# only keep good subjects (data_filter holds the noisy trials from step 3);
# plain indexing keeps the row names, so the CSV written below is unchanged
data2_filter <- data_filter[data_filter$ID %in% good_subj, ]

# write out CSV file
write.csv(data2_filter, file = "all_data_28102020.csv")

# run analysis again on new data
gm2a <- glmer(Correct3 ~ Type * Condition + (1 | ID) + (1 | Video),
              family = binomial,
              data = data2_filter) # change data here
print(summary(gm2a))

# The results show a very significant effect of Type (AV more accurate than A)
# mean and SEM for every Condition x Type cell
mean_plot <- data2_filter %>%
  group_by(Condition, Type) %>%
  mutate(Correct_plot = as.numeric(Correct3 == "Correct")) %>% # 1 = correct, 0 = incorrect
  summarise(
    mean = mean(Correct_plot, na.rm = TRUE),
    # divide by the number of scored trials so that the denominator matches
    # the values that actually entered sd()
    SEM = sd(Correct_plot, na.rm = TRUE) / sqrt(sum(!is.na(Correct_plot)))
  ) %>%
  ungroup() %>%
  # build the axis label from the data instead of hard-coding it
  mutate(Group = paste(Condition, Type, sep = "_"))

# rough bar plot of data; one fill colour per level of Type
colors <- c("orange", "dodgerblue3")

# print() so the figure is also drawn when the script is run via source()
print(
  ggplot(mean_plot, aes(x = Group)) +
    geom_bar(aes(y = mean, fill = Type), alpha = 0.6, stat = "identity",
             show.legend = FALSE) +
    geom_linerange(aes(ymin = mean - SEM, ymax = mean + SEM), size = 2) +
    scale_fill_manual(values = colors) +
    ylim(0, 1) + ggtitle("Advanced subj remove correction")
)