# This script was made by Joshua Balsters at Gorilla to organise data from the SART Gorilla Academy course. 16/02/2021

#load the packages with functions you'll need for data processing and analysis
library(dplyr)
library(stringr)
library(ggstatsplot)

# remove every object from the current workspace so the script starts clean
rm(list = ls())

# make the folder holding the exported data the working directory;
# the file names used below are relative to it
setwd("~/Dropbox (Cauldron)/Cauldron Team Folder/Sales and Marketing/Research Methods Course/Materials/SART/analysis/data_SART_exp")

# read one export per task node (read.csv already assumes a header row and
# comma-separated values, so those arguments do not need to be spelled out)
Random <- read.csv("data_exp_42207-v5_task-95m9.csv") # Random SART
Fixed <- read.csv("data_exp_42207-v5_task-rara.csv") # Fixed SART

# empty data frames that will hold the per-task results (F = Fixed, R = Random)
# and the combined table
task_data <- task_dataF <- task_dataR <- data.frame()


# Count omission and commission errors for every participant in one task node.
#
# Arguments:
#   node_data   - data frame read from one task export. It must contain the
#                 columns selected below (Participant.Private.ID,
#                 Participant.Starting.Group, Zone.Type, Response, type,
#                 Trial.Number, ...).
#   n_go_trials - number of trials that required a response. Defaults to 200
#                 (the required responses to the numbers 1-2 and 4-9).
#
# Value:
#   A data frame with one row per participant and four columns, in this order:
#   ID, Group, OE (omission errors) and CE (commission errors). Each column
#   keeps its own type, so IDs and counts are not turned into text just because
#   the group label is text. A data frame with zero rows (but the same four
#   columns) is returned when node_data contains no participant IDs.
count_sart_errors <- function(node_data, n_go_trials = 200L) {
  # find unique IDs; sometimes the last value is NA, so remove NAs
  ids <- unique(node_data$Participant.Private.ID)
  ids <- ids[!is.na(ids)]

  # lapply simply does nothing for an empty ID vector, whereas 1:length(ids)
  # would run the loop body for i = 1 and i = 0
  per_participant <- lapply(ids, function(current_id) {
    # show the participant being processed in the Console
    print(current_id)

    # keep only this participant's rows
    ss_data <- filter(node_data, Participant.Private.ID == current_id)

    # make a note of which group the participant is from
    group <- ss_data$Participant.Starting.Group[1]

    # 1) remove columns you don't need, 2) only keep response zones,
    # 3) only keep task trials, 4) only keep the first response in each trial
    responses <- ss_data %>%
      select(
        Participant.Starting.Group, Participant.Private.ID, Screen.Name,
        Zone.Type, Reaction.Time, Response, display, type, number,
        Trial.Number
      ) %>%
      filter(str_detect(Zone.Type, "response")) %>%
      filter(type == "task") %>%
      distinct(Trial.Number, .keep_all = TRUE)

    # counting matches directly gives 0 (not NA) when a participant never
    # produced that kind of response, so no special case is needed
    n_correct <- sum(responses$Response == "correct", na.rm = TRUE)
    n_incorrect <- sum(responses$Response == "incorrect", na.rm = TRUE)

    data.frame(
      ID = current_id,
      Group = group,
      OE = n_go_trials - n_correct, # omission errors: required responses that were missed
      CE = n_incorrect, # commission errors: responses marked incorrect
      stringsAsFactors = FALSE
    )
  })

  if (length(per_participant) == 0) {
    # no participants: still return the expected four columns
    return(data.frame(
      ID = integer(0),
      Group = character(0),
      OE = integer(0),
      CE = integer(0),
      stringsAsFactors = FALSE
    ))
  }

  do.call(rbind, per_participant)
}

# run the same processing for both task nodes (Fixed and Random SART)
task_dataF <- count_sart_errors(Fixed)
task_dataR <- count_sart_errors(Random)
  
# name the four columns of each per-task table so you know what they are
names(task_dataF) <- c("ID", "Group", "FixedOE", "FixedCE")
names(task_dataR) <- c("ID", "Group", "RandomOE", "RandomCE")

# merge combines 2 datasets using a common key, in this case ID.
# Both tables have a Group column, so merge keeps them apart as
# Group.x (from the Fixed table) and Group.y (from the Random table).
task_data <- merge(task_dataF, task_dataR, by = "ID")

# statistical functions in R need each variable to have the right type
# (number, character, factor, ...); str(task_data) shows the current types

# these variables identify participants and groups, so make them factors
factor_columns <- c("ID", "Group.x", "Group.y")
task_data[factor_columns] <- lapply(task_data[factor_columns], as.factor)

# these variables are error counts, so make them whole numbers
count_columns <- c("FixedOE", "FixedCE", "RandomOE", "RandomCE")
task_data[count_columns] <- lapply(task_data[count_columns], as.integer)

# save the combined table as a csv file for analysis in another package
write.csv(task_data, file = "data_rstats.csv")
  
  # Test for group differences in commission errors with the between-subjects
  # plot from ggstatsplot (see its documentation for more details).
  # type = "np" asks for non-parametric statistics: a previous analysis showed
  # the data fail Levene's test. Parametric, robust, or Bayesian statistics can
  # be requested instead by changing this argument.
  # Group.x is the group column that came from the Fixed SART table in the merge.

  # Fixed SART: commission errors by group
  ggbetweenstats(
    data = task_data,
    title = "Fixed SART commision errors by Group",
    type = "np",
    x = Group.x,
    y = FixedCE
  )

  # Random SART: commission errors by group
  ggbetweenstats(
    data = task_data,
    title = "Random SART commision errors by Group",
    type = "np",
    x = Group.x,
    y = RandomCE
  )