##########################################
#### Week 3 Lab: Two Parameter Models ####
#### Friday, September 22nd, 2016    ####
##########################################

########################################################
#### Download lab files into your working directory ####
########################################################

library(lmSupport)

# Set your working directory

# We're using the bias dataset from last week
d = dfReadDat("Lab3_BiasData.dat")

# Check out the dataset
varDescribe(d) # Always look at descriptives
head(d)        # first few rows

# How could you achieve the same thing as head() using bracket notation?
d[1:6, ]

# some() lives in a package we don't really want to load (why load a whole package for one function?)
# We can also use bracket notation to do the same thing as some(). Let's use Google:
# "get random elements of vector R"
?sample
sample(c('pig', 'cow', 'chicken'), 2)
d[sample(1:nrow(d), 6), ] # six randomly chosen rows -- we're very smart!

###########################################
#### Read page 1 of Lab3_Exercise.docx ####
###########################################

# What's the experiment about?
# Testing the effectiveness of an intervention to reduce implicit bias.

# What is the independent variable? Concern about discrimination against Blacks.
# What is the dependent variable? Implicit bias as measured by the IAT.

# Prediction: Participants' concern about discrimination in society should predict
# their week 4 IAT scores, such that participants who are more concerned
# about discrimination should have lower (less positive) IAT scores.
# (Positive scores on this IAT indicate an implicit preference for White over Black.)
# In other words: Does concern score explain variance in week 4 IAT scores?

################################################################
#### What models should we compare to test this hypothesis? ####
################################################################

# DATA = MODEL + ERROR
# Model C: Y = b0 + e            OR  Wk4IAT = b0 + e
# Model A: Y = b0 + b1*X1 + e    OR  Wk4IAT = b0 + b1*ConcernM + e

###################################
#### Prepare data for analysis ####
###################################

# We have four concern items; we need to create an average score for each participant.
# Which of the items are reverse-coded?
Concern = c('Concern1', 'Concern2', 'Concern3', 'Concern4')
library(psych)
psych::alpha(d[, Concern], check.keys = TRUE)
# It appears the reverse-scored item from last week (item 4) has already been recoded.

d$ConcernM = varScore(d, Forward = Concern,
                      Reverse = NULL, # NULL is the default, but included here for reference
                      Range = c(1, 10),
                      Prorate = TRUE,
                      MaxMiss = 0.3) / 4

# Why varScore() and not rowMeans()?
# 1. rowMeans(..., na.rm = TRUE) will give everyone a score, even a person who answered only
#    1 of the 4 items. That person SHOULD be set to missing because there is not enough data
#    to prorate. With varScore you set the threshold for the maximum percent missing; with
#    rowMeans you either get NA if anything is missing (na.rm = FALSE) or a score unless
#    everything is missing (na.rm = TRUE). That is a big problem.
# 2. If you are using sums rather than means, as many scales require, the sum will be wrong
#    when items are missing, because rowSums() does not prorate.
# varScore handles all of this. It also doesn't require you to manually recode reversed items
# (a common source of errors); with rowMeans you would first need to make sure all items have
# been recoded. Finally, varScore does some error checking (it verifies that values fall in
# the specified Range); rowMeans does not. (See the small illustration below.)
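# A tiny illustration (just a sketch; the "toy" data frame below is made up for demonstration
# and is not part of the lab data): person 3 answered only 1 of the 4 items, so rowMeans()
# still hands them a score while varScore() sets them to missing.
toy = data.frame(Concern1 = c(5, 8, NA),
                 Concern2 = c(6, NA, NA),
                 Concern3 = c(4, 9, NA),
                 Concern4 = c(7, 7, 2))
rowMeans(toy, na.rm = TRUE) # everyone gets a score, even person 3
varScore(toy, Forward = Concern, Range = c(1, 10),
         Prorate = TRUE, MaxMiss = 0.3) / 4 # person 3 is NA: too much missing data to prorate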
################################################
#### The Compact Model: One Parameter Model ####
################################################

# Fit a one-parameter model
# What is our one parameter? The sample mean of Week 4 IAT.
mC = lm(Wk4IAT ~ 1, data = d)

# We can ask for the values of Y that are predicted by our model
d$fvC = modelPredictions(mC)
d$fvC # report predicted values
# What is the number we're predicting for everyone? Again, the sample mean of Week 4 IAT.

# We can ask for the residuals
d$eC = modelErrors(mC) # errors for each subject
d$eC

# See that the predicted values (our model) plus the errors equal the data themselves!
cbind(d$Wk4IAT, d$fvC + d$eC) # first column is the data, second is predicted values + errors

# And we can look at the coefficients, or parameter estimates, themselves
coef(mC) # report model coefficients (parameter estimates)

# If we want to ask questions about model fit, we need to calculate SSE:
SSEC = sum(modelErrors(mC)^2)
SSEC
# Does this value alone tell us if the model fits the data well?
# The sum of squared errors by itself provides only minimal information; we don't know how
# good this model is compared to any other model we might estimate.

##################################################
#### The Augmented Model: Two Parameter Model ####
##################################################

# Fit a two-parameter model
# What is our second parameter? The participant's concern score.
mA = lm(Wk4IAT ~ 1 + ConcernM, data = d)

# We can ask for the values of Y that are predicted by our model
d$fvA = modelPredictions(mA)
d$fvA # these now vary based on concern score

# We can also ask for the errors
d$eA = modelErrors(mA) # errors for each subject
d$eA

# And we can ask for just the coefficients, or parameter estimates, themselves
coef(mA) # report model coefficients (parameter estimates)
# Week4IAT = Y = b0 + b1*Concern
# Week4IAT = 0.6985 + -0.041*Concern

# If we want to ask questions about model fit, we need to calculate SSE:
SSEA = sum(d$eA^2)
SSEA

# Model Comparison
varDescribe(d)
n = nrow(d)   # number of subjects/observations
dfN = 2 - 1   # 2 parameters in Model A, 1 parameter in Model C
dfD = n - 2   # 2 parameters in Model A
FStat = ((SSEC - SSEA)/dfN) / (SSEA/dfD)
FStat
pf(FStat, dfN, dfD, lower.tail = FALSE)

# What does the p-value tell us? (generally, not this specific p)
# p-value: the probability of obtaining this observed F-statistic (or a more extreme one)
# in a random sample of n observations from the population, assuming the null hypothesis
# is true.

modelCompare(mC, mA) # directly compares the models and calculates the F-statistic

modelSummary(mA, t = FALSE)
# How is this related to what we just did?
# Remember: R is giving us the results of two different model comparisons.
# Each line in the summary is associated with a different comparison.

# What is the interpretation of the second coefficient, b1? What two models are being compared?
# The effect of concern on Week 4 IAT scores. The augmented model is being compared to the
# mean-only (compact) model.

# What is the interpretation of the first coefficient, b0? What two models are being compared?
# The predicted IAT score for someone with a concern score of 0. This model is being compared
# to a model that makes predictions from concern score alone (with no intercept).

coef(mC) # Coefficients from Model C
# Why is b0 different between Model A and Model C?
# The regression line has a slope now, so it won't cross the y-axis at the mean. If we
# mean-centered concern and ran a new model, b0 would be the same as in our first model.
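# A quick check of that claim (a sketch; ConcernMC and mA_centered are new names created here
# for illustration, not part of the original lab code): mean-center concern and refit.
d$ConcernMC = d$ConcernM - mean(d$ConcernM, na.rm = TRUE)
mA_centered = lm(Wk4IAT ~ 1 + ConcernMC, data = d)
coef(mA_centered) # b0 should now match the intercept from coef(mC) (barring any rows
                  # dropped for missing data); b1 is unchanged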
# Confidence Intervals
confint(mA) # confidence intervals for b0 and b1

# Effect Sizes
# Calculate PRE / partial eta-squared for b1 by hand
(SSEC - SSEA)/SSEC

# Or just use modelEffectSizes() to get partial eta-squared
modelEffectSizes(mA)

# What does partial eta-squared represent?
# It represents the proportion of error that is reduced in the augmented model relative to
# the compact model. In this case, that reduction in error is due to adding the concern
# variable.

#####################################
#### Graphing our Data and Model ####
#####################################

# You will encounter us using the terms "quick and dirty" plot and "publication quality"
# plot. Generally speaking, by the latter we mean ggplot. Quick and dirty plots are more
# for your own understanding and ability to look at the data visually.

# Based on last week:
plot(d$ConcernM, d$Wk4IAT)
abline(lm(d$Wk4IAT ~ d$ConcernM))

# Generating a regression line with CI bands using the effects package
library(effects) # used for quick and dirty views of models. VERY IMPORTANT.
plot(effect('ConcernM', mA))

# Why are the CI bands not linear?
# Because we're more confident in our predictions in the ranges of the data where we have
# more observations.
# But are these the error bands we want? It's unclear. This is why you should never use the
# 'plot' command for anything other than getting an idea of what the relationship looks like.

# Publication-quality graphing

# Load ggplot2
library(ggplot2)

# From the help: ggplot() is typically used to construct a plot incrementally,
# using the + operator to add layers to the existing ggplot object. This is
# advantageous in that the code is explicit about which layers are added and
# the order in which they are added. For complex graphics with multiple layers,
# initialization with ggplot is recommended.

# Generating predicted data (necessary for confidence interval bands)
# Create a data frame of predictor values; the first two numbers are the range of the predictor
pY = data.frame(ConcernM = seq(1, 10, length = 100))
View(pY) # what is this?
# A data frame containing just one variable, ConcernM, representing many of the possible
# values of ConcernM.

# Use modelPredictions() to get the predicted values (Y-hats) and their standard errors
pY = modelPredictions(mA, pY)
View(pY) # check this out
# This added columns for the predictions, the lower and upper confidence interval bounds,
# and the SEs.

# Graph a scatterplot of the data with ConcernM on the x-axis and Wk4IAT on the y-axis
plotA = ggplot(d, aes(x = ConcernM, y = Wk4IAT)) # set the general parameters for the plot
plotA
# ggplot interprets the names inside aes() as column names in the data.

# Now we add layers to this plot as we go:
plotA = plotA + geom_point()
plotA # overwrite plotA with a new object that has everything plotA had, plus the points

# Now let's add a layer to the plot that will graph the regression line.
# Can we just use the default parameters? Nope. The data for the regression line must come
# from the pY data frame so we can include confidence bands.
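# (Aside, just a sketch: ggplot can draw its own line and bands with geom_smooth(method = "lm"),
# but those bands are computed internally by ggplot rather than from our fitted model mA.
# Building the layer from modelPredictions(), as below, keeps the figure tied to the model we
# actually tested.)
plotA + geom_smooth(method = "lm") # the quick default, shown for comparison only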
plotA = plotA + geom_smooth(aes(ymin = CILo, ymax = CIHi, x = ConcernM, y = Predicted),
                            data = pY,         # where to find these variables
                            stat = "identity",
                            color = "red")     # a different color so the line stands out
plotA

# Finally, let's do just a couple of things to make it look nice:
plotA = plotA + theme_bw(base_size = 14) + # remove the grey background
  labs(x = 'Concern (Mean)',
       y = 'Week 4 IAT Score',
       title = 'Effect of Concern on IAT Scores')
plotA

# Everything at once (combining the previous code into a single plot)
plotA2 = ggplot(d, aes(x = ConcernM, y = Wk4IAT)) +
  geom_point(data = d, aes(x = ConcernM, y = Wk4IAT), color = "black") +
  geom_smooth(aes(ymin = CILo, ymax = CIHi, x = ConcernM, y = Predicted),
              data = pY, stat = "identity", color = "red") +
  theme_bw(base_size = 14) +
  labs(x = 'Concern (Mean)', # label for the x-axis
       y = 'Week 4 IAT Score',
       title = 'Effect of Concern on IAT Scores')
plotA2

# If you were prepping this graph for publication, what else would you want to do?
# Possibly get rid of the grid lines.
# Improve the tick marks on the x- and y-axes.
# Truncate the prediction line so it stops at the lowest observed value of concern.
# Other things?
# We will learn how to do all of these things later!

######################
#### Lab exercise ####
######################

# Do numbers 1-7 in small groups. Then stop, and we'll reconvene to do the graphing.

#####################
#### Extra Stuff ####
#####################

# Calculate the SD of the errors
SE = sqrt(SSEA/dfD)
SE # residual standard error (also called the standard error of the estimate)

# Correlation: when you want the relationship between two quantitative variables
cor(d$Wk4IAT, d$ConcernM)      # r -- an effect size indicator (= the square root of partial eta-squared)
cor.test(d$Wk4IAT, d$ConcernM) # r has a p-value
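# One last sanity check (a sketch): with a single predictor, squaring r should reproduce the
# PRE / partial eta-squared for ConcernM computed earlier.
cor(d$Wk4IAT, d$ConcernM)^2
(SSEC - SSEA)/SSEC # same value, within rounding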