################################################
####     Week 5 Lab: 2+ Predictor Models    ####
####       Friday, October 6th, 2017        ####
################################################

######################################################################################
####       The Prestige Dataset        ###############################################
######################################################################################

# Today we're going to use a dataset (Prestige). This dataset is one that comes
# included in R packages, so it doesn't need to be downloaded and read in with
# dfReadDat.

# How can we find out more about the Prestige dataset?
# First, we need to load one of two packages: car and effects
library(car) # we'll use this one since we are going to use this package anyway
?Prestige # Description of the Prestige dataset

# Copy Prestige dataset into an object called 'd' to prevent unnecessary typing.
d = Prestige

# Run some commands to get a sense of the data
# Use the help pane to get some more information about these variables
str(d)
head(d)
varDescribe(d)

# What does each row represent?
# Each row contains information on a particular occupation (given in the name of the row)

# You can list all the occupations:
rownames(d)

# Create a correlation table listing all of the pairwise correlations between the 4 variables
# we care about: 'income', 'education', 'women', and 'prestige'

cor(d[, c('income', 'education', 'women', 'prestige')], use='pairwise.complete')

# What does the use = pairwise.complete do?
?cor 
# It tells R how to deal with missing values. Here, we're telling to to use only values for which
# a given occupation has both focal values.
# The alternative (and the default) is listwise deletion. This excludes any observations for which
# even one variable of the data record is missing.

# What do you notice? Which variables are highly correlated, which are not? Do the patterns make sense?
# Education and percentage women are relatively uncorrelated, as are percentage women and prestige.  
# However, there are moderate to high correlations between all the other variables. For instance, 
# education level is highly positively correlated with income, as is the prestige of the occupation 
# (makes sense!). There are also some depressing facts, e.g. that the percentage of women in an 
# occupation is negatively correlated with income.
# Note that the data come from 1971, so (hopefully) things have changed a bit since then.

# What if we also want p values for these correlations?
library(psych)
?corr.test
corr.test(d[, c('income', 'education', 'women', 'prestige')], use='complete')
# Let's save it as an object so we can easily call it later:
dcors = corr.test(d[, c('income', 'education', 'women', 'prestige')], use='complete')

# Why can't we just use 'd' instead of doing this subsetting?
# There is a "type" variable that is a factor with more than 2 levels. R cannot compute correlations
# for variables like this one (i.e., categorical variables with > 2 levels).

# We can also look at these correlations visually, by looking at all the bivariate scatterplots
spm(d[, c('income','education', 'women', 'prestige')]) # this command lives in the car package

# What do the plots along the diagonal represent? What about the red and green lines? How can we find out?
?spm 
# The green line represents a linear regression fit between the two variables, the red curve is a LOESS regression fit.
# We won't talk about LOESS regression methods in this course, but look them up if you are interested. Basically,
# it's a non-linear method of fitting a curve that is sensitive to local patterns in the data.


####  Quick Review   ####

# We are interested in whether occupations with more years of education offer higher average income
# Fit a model that tests this hypothesis

m1 =lm(income ~ education, data = d)

# Evaluate the model. 
modelSummary(m1)
modelEffectSizes(m1)
# Is education significantly related to income?
# Yes
# What direction is the effect? 
# Positive
# How large is the effect?
# Very. Education accounts for 33.4% of the variability in income.


##################################################################################
#### Models with multiple predictors #############################################
##################################################################################

# Before collecting the data, let's assume that your PI suggested you fit a model 
# with percentage of women as a covariate.
# In general, why might it make sense to add a covariate of this kind? 
# One reason is to increase statistical power by reducing the unexplained variance in the DV.
dcors
# It's reasonable to expect that, seeing as percent women is correlated with income but not education

# Another is to answer a slightly different question. What would that question be here?
# Holding gender constant, what is the effect of education on income?
# This might be done in response to a reviewer who says something like "I think what's
# really driving the effect is gender, as men are more educated than women" (it's an
# older reviewer who hasn't consulted high school and college graduation rates recently)

# Fit a model predicting income from education and the percentage of women in a given profession.
m2 = lm(income ~ education + women, data = d)
modelSummary(m2) 

# How do we see that adding percent women gives us more power to test the effect of education?
modelEffectSizes(m2)
modelEffectSizes(m1)
# Compare partial eta-squared in the models with and without the covariate.
# It's now much larger! When controlling for percent women, education explains
# 46% of the variance in income. That's a huge amount.

# Notice also how our CI bands narrow after we add the covariate to the model:
library(effects)
plot(effect('education', m1))
plot(effect('education', m2))
# CI bands narrow because we added a good covariate (women)
# we are removing some of the previously unexplained variance in the DV (income)
# *Note: remember you can use the back and forward arrows in the plots pane to toggle between graphs.

# Did our parameter estimate for education change? Is that surprising?
coef(m1)
coef(m2)
# Our parameter estimate changes slightly, since we added a covariate to the model.


################################################################
####   3 predictor models   ####################################
################################################################

# Why might we add a third predictor? (four main situations)

# 1. Statistical power: to increase power to test focal predictor’s effect on DV 
# 2. Additional explanatory power: to demonstrate that focal predictor adds
#    explanatory power above and beyond other predictor(s)
# 3. Precision: allows us to answer more specific questions.
# 4. Mediation (to be covered in a later unit)

# New question: Does education explain variance in income above and beyond prestige?
# Maybe jobs that require more education are also more prestigeous, and it's prestige 
# that's really related to income.
# Is there an effect of education on income, even when we control for prestige?

# Fit a 3rd model predicting income from education, perceived prestige, and percentage of women
m3 =lm(income ~ education + women + prestige, data = d)
modelSummary(m3) 
modelEffectSizes(m3)

# What is the interpretation of the intercept now?
# The income predicted by the model when the variables education, women, and prestige equal zero.
# In other words, the income for an occupation with no women, for which you need no education and 
# that has a prestige score of zero on our scale.

# This isn't particularly meaningful. One way to make the parameter estimate for the intercept more 
# meaningful is to mean-center our predictor variables, and refit the model
d$educationC = d$education - mean(d$education)
d$womenC = d$women - mean(d$women)
d$prestigeC = d$prestige - mean(d$prestige)

m3C = lm(income ~ educationC + womenC + prestigeC, data = d)
modelSummary(m3C) 

# What is the interpretation of the intercept now?
# The predicted income for a job with average percent women, average education, and average prestige.
# Compare the model summary output between m3 and m3C. What changed? What hasn't changed?
modelSummary(m3)
# Why is the standard error for the intercept so much larger for the model with uncentered 
# predictors as compared to the model with centered predictors? 
# Our predictions are much more precise around the mean values of our predictors, as 
# compared to extreme values of our predictors


# Do any of the effect sizes change between the two models?
modelEffectSizes(m3)
modelEffectSizes(m3C)
# Yes, for the intercept, but not for our predictors

# What is the interpretation of the coefficient for education and what does the coefficient tell us? 

# This coefficient tells us the anticipated change in annual occupational income associated with 
# each additional year of education when the percentage of women AND the prestige of the occupation 
# are held constant. 


# There is a dramatic change in the coefficient for education. It has gone from being quite large 
# ($944, from the 3 parameter model) to not so large ($177, from the 4 parameter model). Why?

# We can find a partial answer to this conundrum by revisiting the correlation
# between education and prestige.
dcors
dcors$r[2,4]
# Education and prestige have an extremely strong relationship (r = .85).  In
# fact, if we predict prestige from education, we will find that every extra
# year of education is associated with a 5.4 unit increase in occupational prestige,
# and that education accounts for 72% of the variance in occupational prestige.
m.pres = lm(prestige ~ education, data = d)
modelSummary(m.pres)
modelEffectSizes(m.pres)

# Because of the strong relationship between prestige and education, it's likely
# that a lot of the variance in education that predicts income will overlap with
# prestige.  In fact, the dramatic change in the coefficient for education
# when prestige is added to the model tells us precisely that: the
# variance in education that predicts income overlaps with the variance in
# prestige that predicts income.

# We can directly test for multicollinearity of all predictors in our model using this function:
?vif
vif(m3)
# vif stands for 'variance inflation factor' 
# If you have a vif value >= 5 for your focal predictor(s), then multicollinearity is a problem
# If it is less than 5 but still close (around 4 for education and prestige in this example)
# then there is some multicollinearity that is affecting your model, but not necessarily critically so.
# We don't really care about this much unless it spells trouble for our focal variables.


# What is the interpretation of the coefficient for women?
modelSummary(m3C)
# It is the amount by which anticipated income decreases for every additional percent of women
# present in a given occupation, holding education and prestige constant.


# The education coefficients vary dramatically between model 2 and model 3. But what if 
# our main predictor had been prestige, instead of education, in model 2? Would the prestige 
# coefficients similarly change when adding the third predictor? Would the magnitude of the 
# effect size for prestige change?
# Fit a model (m2b) predicting income from percent women and prestige. Compare this model to model m3C.

m2b = lm(income ~ women + prestige, data = d)
modelSummary(m2b)
modelSummary(m3C)
# The coefficients for prestige don't change as drastically ($166 in the 3 parameter model to 
# $141 in the 4 parameter model) as those for education.
# Note also how the SE for prestige doubles when education is added to the model

modelEffectSizes(m3C)
modelEffectSizes(m2b)
# There is still a dramatic change in the magnitude of the effect of prestige when education is added.


# What is the model comparison interpretation of model R-squared for model 3C? 
modelR2(m3C)
# It is the test of the reduction in error (SSE) for the augmented 3 parameter
# model relative to the 1 parameter (Mean-Only) model. For the computation of model R-squared
# the augmented model is ALWAYS compared to the mean-only model. Note this is a three-
# parameter test.


######################################################################################
####             Graphing              ###############################################
######################################################################################
# * may skip in class to be able to do exercise

# Create a plot showing the effect of education on income when controlling for percentage 
# of women and prestige
# What values should we choose for percent women and prestige?
# We'll use the mean

# Quick and dirty look at the data
plot(effect('educationC', m3C)) #produces CI bands

# Publication-quality graphing
# Create our set of values on our predictors for which we want predictions
pX = data.frame(education=seq(min(d$education), max(d$education), length = 100), 
                prestige = mean(d$prestige), women = mean(d$women))
View(pX)
pY = modelPredictions(m3, pX)
View(pY)

# Basic plot elements
plot1 = ggplot(data=d, aes(x = education, y = income))
plot1

# Add raw data
plot1 = plot1 + geom_point()
plot1

# Add regression line
plot1 = plot1 + geom_smooth(aes(y = Predicted, ymin = CILo, ymax = CIHi),
                            data = pY, stat = 'identity', color='red')
plot1

# Make it pretty
plot1 = plot1 + theme_bw(base_size = 14) + 
  scale_x_continuous('Education', breaks = seq(6, 16, by=2)) +   # x-axis label and tick mark location. 
  scale_y_continuous('Income', breaks = seq(0, 26000, by=5000))  # y-axis label and tick mark location.
plot1

# If you were asked to draw a best-fitting regression line on the graph based on the raw data,
# this probably isn't what it would look like. Why does it appear to fit the data poorly?
# The raw data aren't adjusted for prestige or percent women, while the regression line is.
# We can add another regression line reflecting the results before we controlled for prestige
# and percent women.
pY2 = modelPredictions(m2, pX)
plot1 = plot1 + geom_smooth(aes(y = Predicted, ymin = CILo, ymax = CIHi),
                            data=pY2, stat='identity', color='green')
plot1