##########################################
###  Week 4 Lab: Dichotomous predictors ##
####   Friday, September 29th, 2017   ####
##########################################

########################################################
#### Download lab files into your working directory ####
########################################################

library(lmSupport)
library(ggplot2)  # needed for the plotting section below

# We're using the same bias dataset from previous weeks
# Set your working directory

#### Cool RStudio trick ####
# d = dfReadDat("")  # hit tab between the two double quotes to open a file explorer
d = dfReadDat("Lab4_BiasData.dat")

# Check out the dataset
varDescribe(d)  # Always look at descriptives
head(d)         # first few rows

# You could achieve the same thing as head() using bracket notation
d[1:6, ]

#########################################################
#### Some data manipulation                          ####
#########################################################

View(d)

# Last week we ran some analyses on the continuous variable "Trainhours". While this informed
# us about the effect of number of training hours on IAT scores, it did not necessarily
# tell us whether our specific intervention (indicated by the "Condition" variable) had an effect.
# In fact, let's look at the "Trainhours" and "Condition" variables side by side.

# How can we tell R to show us just the Condition and Trainhours variables?
cbind(d$Condition, d$Trainhours)

# It looks like people in our no-training condition actually received bias training from
# another source. As such, it's possible we will get a different result when we look at our
# study's manipulation specifically.

# Last week, a couple of you said you found it a bit confusing that people who were not in
# our training condition had non-zero training hours. So what if we find this Trainhours
# variable distracting? For that matter, there are also other variables in our dataset that
# we know we don't need this week.
# How would you keep only the variables you care about using bracket notation?
names(d)
ColumnNames = c('Condition', 'BaseIAT', 'Wk4IAT')  # Create a vector of the column names you want
d = d[, ColumnNames]  # index into just those columns and assign only those variables back into d

# This would also work, but what is wrong with doing it this way?
# d = d[, c(1, 6:8)]
# **You're prone to make errors. Imagine what would happen if you added a new column to the
# dataframe without changing this code.

# Now that we have those variables out of the way, here are some other checks we may want to do:

# Remember, we can look at how many people were in each condition using table()
table(d$Condition)

# This also tells us how the variable is currently coded. This type of coding is called
# "dummy coding". The no-training group is the reference condition. Generally,
# you want to code your regressors in a way that allows you to answer your specific
# research question.
# We will deal with zero-centered contrast coding in the lab exercise later.

####################################################################
#### Fitting a two-parameter model with a dichotomous predictor ####
####################################################################

# Now we want to test whether baseline IAT scores significantly
# differed by condition (Condition: Bias Training = 1, No Bias Training = 0). In other words,
# since participants in both groups already had some training hours coming in, are IAT
# scores already different between the groups BEFORE our manipulation? Would this be a good
# thing or a bad thing for our study?
# ...it would be a bad thing

# OK, let's look at this with the parameter testing approach.
m1 = lm(BaseIAT ~ 1 + Condition, data = d)
modelSummary(m1)
modelEffectSizes(m1)
confint(m1)

# From class: what do we expect the model R2 to be in this model?
modelR2(m1)
# **It is the same as the partial eta-squared for Condition, because Condition is the only
# predictor variable in the model.

# How would you interpret b0 and b1? What's the conclusion?
# b0 represents the average BaseIAT for people in the control group (because for them
# Condition = 0). People in the control group have IAT scores significantly above 0.
# b1 represents the effect of condition (i.e., the change in IAT scores as you move from
# the control to the experimental condition) on baseline IAT. This effect is non-significant,
# which means we cannot conclude the groups differed on IAT scores at baseline (this
# is good).
# Very soon we will teach you a method for dealing with baseline differences, if you find them.

#################################
#### Plotting using barplots ####
#################################

# Although we could technically use a line graph to display the results,
# the general convention is to create a barplot rather than a scatterplot.

pY2 = data.frame(Condition = c(0, 1))  # Same idea as last week, but now we need only
# 2 values to represent all possible values of the Condition variable.

# We need the predicted value (the mean) for each condition,
# plus lower and upper standard error bounds for each condition.
# SE bars represent the standard error of the point estimates.
pY2 = modelPredictions(m1, pY2)
pY2

# Note how the predicted values are just the means within each group
varDescribeBy(d$BaseIAT, d$Condition)

# Starting plot
plotB = ggplot(pY2, aes(x = Condition, y = Predicted))
plotB  # correct axes? Good ranges? Yup.

# Add bars
plotB = plotB + geom_bar(mapping = aes(fill = Condition),  # color the bars by condition
                         data = pY2,
                         stat = "identity",  # stat = "identity" because the bars represent particular values in the data
                         width = 0.5)
plotB

# Why is the legend a gradient? R thinks we could theoretically have values between 0
# and 1, but obviously we cannot.
# You can fix this by telling R to treat Condition as a factor
# (a fancy word for a categorical variable) instead of a number:
plotB = ggplot(pY2, aes(x = Condition, y = Predicted)) +  # re-initialize the plot
  geom_bar(mapping = aes(fill = as.factor(Condition)),
           data = pY2,
           stat = "identity",
           width = 0.5)
plotB

# Add the raw data points, with jittering so we can see them
# (BaseIAT, since this plot shows baseline scores)
plotB = plotB + geom_point(data = d,
                           aes(y = BaseIAT, x = Condition),
                           colour = 'darkgrey',
                           position = position_jitter(w = 0.1, h = 0.1))
plotB

# Add error bars
plotB = plotB + geom_errorbar(data = pY2,
                              aes(y = Predicted, x = Condition,  # error bar aesthetics: x & y variables
                                  ymin = CILo,   # bottom of the error bars
                                  ymax = CIHi),  # top of the error bars
                              stat = "identity",
                              width = 0.25)  # width of the error bar caps
plotB

# Finally, add axis labels and remove the legend, since it is unnecessary
plotB = plotB + labs(y = 'Baseline IAT Score', x = 'Condition') +  # set axis labels
  theme_bw(base_size = 14) +          # remove the grey background
  theme(legend.position = "none")     # remove the unnecessary legend
plotB

# Everything at once (combining the previous code into a single plot)
plotB2 = ggplot(pY2, aes(x = Condition, y = Predicted)) +
  geom_bar(mapping = aes(fill = as.factor(Condition)),
           data = pY2, stat = "identity", width = 0.5) +
  geom_point(data = d, aes(y = BaseIAT, x = Condition), colour = 'darkgrey',
             position = position_jitter(w = 0.1, h = 0.1)) +
  geom_errorbar(data = pY2,
                aes(y = Predicted, x = Condition, ymin = CILo, ymax = CIHi),
                stat = "identity", width = 0.25) +
  labs(y = 'Baseline IAT Score', x = 'Condition') +
  theme_bw(base_size = 14) +
  theme(legend.position = "none")
plotB2

# If you were preparing this graph for publication, what else would you want to do?
# - Better tick marks on the y axis
# - Highlight the 0 line on the y axis more, since it's conceptually meaningful
# - Move the bars closer together
# - Change the labels for the Condition variable (x axis)
# - Maybe remove the grid lines

############################################
##   Begin lab exercise in small groups   ##
############################################
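
####################################################################
#### Sketch: zero-centered contrast coding (for the exercise)   ####
####################################################################

# A minimal sketch of the zero-centered contrast coding mentioned above. It uses
# simulated data so it runs on its own; the object names (d2, ConditionC, m2) and
# the simulated effect sizes are illustrative, not from the lab dataset. In the
# exercise you would apply the same recode to d.

set.seed(1)
d2 = data.frame(Condition = rep(c(0, 1), each = 20))         # dummy-coded groups
d2$Wk4IAT = 0.4 - 0.15 * d2$Condition + rnorm(40, sd = 0.1)  # fake IAT scores

# Recode the 0/1 dummy codes to centered contrasts: -0.5 vs +0.5
d2$ConditionC = ifelse(d2$Condition == 1, 0.5, -0.5)

m2 = lm(Wk4IAT ~ 1 + ConditionC, data = d2)
coef(m2)
# With -0.5/+0.5 coding, b0 is the unweighted average of the two group means
# (not the control-group mean), and b1 is still the full group difference
# (training minus no training). The significance test of b1 is unchanged.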