####################################################
## Week 8 Lab: Assumptions and transformations    ##
## Friday, October 20th, 2017                     ##
####################################################

############################################################
#### First Steps: Examine the data & check for outliers ####
############################################################

library(lmSupport)
library(car)
library(psych)

# We are going to return to the Prestige dataset. Remind yourself what's going on here.
d = dfReadDat('Lab8_DemoData.dat', SubID = "SubID")
str(d)
varDescribe(d)

# Remember we had a value that was clearly a data entry error.
# So let's remove it.
d = dfRemoveCases(d, which(d$education == -10.7))
# varDescribe(d)

# Univariate exploration
varPlot(d$income)
varPlot(d$education)
varPlot(d$women)
# Note: This is NOT how we assess normality.

# Bivariate exploration
# Do the correlations match your intuitions?
corr.test(d[, c("income", "education", "women")])
spm(d[, c("income", "education", "women")])

# Fit our linear model predicting income from gender composition and education.
mod0 <- lm(income ~ women + education, data = d)
modelSummary(mod0)
modelEffectSizes(mod0)

#### Case Analyses: Residuals ####
# Are there cases that don't fit the model? Which?
res <- modelCaseAnalysis(mod0, "RESIDUALS")
d[res$Rownames, ]
# When we conducted a case analysis with this model, we discovered two cases
# (general managers and physicians) that were both ill-fit by the model.

#### Case Analyses: Influence & Leverage ####
# Are there cases that exert a lot of influence on our model predictions? Which?
# Are they the same cases that did not fit the model? (From the last analysis.)
inf <- modelCaseAnalysis(mod0, "INFLUENCEPLOT")
d[inf$Rownames, ]
# We discover that the same two cases (general managers and physicians)
# have a lot of influence. Furthermore, sewing machine operators have high
# leverage, but relatively small influence.

#### Case Analyses: Removing problematic points ####
# Let's remove these data points before testing our model assumptions.
# It's possible that a couple of extreme outliers can cause assumptions to be
# violated, and by simply eliminating them the rest of the data conform.
dFull <- d   # just a backup of the full data frame
d = dfRemoveCases(d, rownames(d)[which(d$name == 'general.managers')])
d = dfRemoveCases(d, rownames(d)[which(d$name == 'physicians')])

mod <- lm(income ~ education + women, data = d)
modelSummary(mod)
modelEffectSizes(mod)

# We also discovered that these cases were extreme on income and that both
# income and percentage women have distributions that look non-normal.
# In a situation like this, we might wonder if our two problematic cases
# are caused not by cases that are somehow "weird" but by the fact that
# the underlying distributions of our variables are causing systematic violations
# of the assumptions that the General Linear Model makes about the structure
# of our data.
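# A quick, hedged aside (not a required lab step): the skew() function from psych
# (already loaded above) puts a number on what the varPlot() histograms show.
# Values well above 0 indicate a long right tail.
skew(d$income)
skew(d$women)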
#################################################
#### Second Step: Checking Model Assumptions ####
#################################################

# The GLM makes five primary assumptions about the structure of the data:
#  (1) no measurement error in the predictors;
#  (2) independence of residuals;
#  (3) normality of residuals;
#  (4) constant variance (homoscedasticity);
#  (5) linearity of relationships.

# Violations of assumptions can cause:
#  1. Inefficient standard errors (low power, type II errors)
#  2. Inaccurate standard errors (incorrect statistical tests, type I errors)
#  3. Inaccurate parameter estimates

# Assumptions (3) - (5) can be evaluated using modelAssumptions().
# Assumption (1) is always violated to some extent, but essentially demands
# that you use the most reliable measures at your disposal.
# Assumption (2) can be addressed by a variety of approaches (e.g., repeated measures analysis).

###########################################
#### Assumption 3: Normality of errors ####
###########################################
library("gvlma")

# Remember that "normality" in this case refers to the model residuals, NOT the
# distribution of any single variable in the model.
# A quantile-comparison (Q-Q) plot of the studentized residuals readily displays
# how well the studentized residuals compare to a theoretical t-distribution.
# modelAssumptions() will give you this and more.
modelAssumptions(mod, "NORMAL")

# The function will also spit out some statistical tests of these assumptions.
# THEY ARE NO SUBSTITUTE FOR YOUR OWN VISUAL INSPECTION OF THE RESIDUALS!
# In the case of our model predicting income from education and percentage women,
# modelAssumptions tells us that our data seem to violate all the assumptions
# of the GLM. But this is not necessarily diagnostic on its own.

# So, should we be concerned at this point? Well, we're not doing very well on
# normality, but let's not make any brash decisions yet.

#########################################
#### Assumption 4: Constant variance ####
#########################################
# The constant variance assumption implies that, for any given predicted value
# of the dependent variable, the residuals will have the same variance.
# One easy way to assess the viability of this assumption is to plot the predicted
# values against the residuals and assess visually whether the residuals
# have more scatter at some predicted values than at others.
# modelAssumptions() will generate that figure, as well as a plot of the predicted
# values versus the absolute value of the residuals (a "spread-level plot").
modelAssumptions(mod, "CONSTANT")

# So... are we concerned now? We're not doing very well in terms of the constant
# variance assumption, but again, we won't make conclusive decisions yet.

#############################
## Assumption 5: Linearity ##
#############################
# If the relationship between a predictor and Y is linear, then the slope of the
# best-fitting line should be approximately the same no matter what sub-sample of
# the data you consider.
# Imagine sorting your dataframe on one of your predictor variables, sliding a
# window down the dataframe, and fitting little models for each window. If the
# relationship is linear, the solutions for the sub-models should be very similar,
# deviating from each other only randomly. On the other hand, if the relationship of
# X and Y is non-linear, then the estimate of the slope will differ systematically
# at different points along X. (A rough do-it-yourself sketch of this idea appears
# at the end of this section.) modelAssumptions() will plot such lines, overlaid on
# the residual scores, in addition to the best linear fit.
modelAssumptions(mod, "LINEAR")

# All that detail aside, if the green line and the red line don't look systematically
# different, then the relationship between that X and Y is relatively linear. Otherwise,
# there may be a non-linear relationship.

# So, we're actually doing pretty well in terms of the linearity assumption. There
# aren't any places where the red and green lines radically diverge.
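# A rough, do-it-yourself sketch of the sliding-window idea described above.
# This is NOT what modelAssumptions() does internally; it is just an illustration,
# and the window size (30) and step (10) are arbitrary choices for this example.
dSorted <- d[order(d$education), ]   # sort cases on the predictor
windowSize <- 30
starts <- seq(1, nrow(dSorted) - windowSize + 1, by = 10)
windowSlopes <- sapply(starts, function(i) {
  # slope of income on education within this window of cases
  coef(lm(income ~ education, data = dSorted[i:(i + windowSize - 1), ]))[["education"]]
})
plot(starts, windowSlopes, type = "b",
     xlab = "Window start (cases sorted by education)",
     ylab = "Within-window slope of income on education")
# If the income~education relationship is linear, these slopes should bounce around
# a single value rather than trend systematically.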
############################
#### So, what do we do? ####
############################

# We have discovered that our data violate the constant variance and normality of
# errors assumptions. We will attempt to correct some of these issues by applying
# transformations. But here is an interesting point: we detected our outliers in a
# model with the raw scores. The outliers may be different in a model based on
# transformed data.
# To be thorough, it might be worth testing whether adding back those points corrects
# any of the violations. Spoiler alert for the sake of our lab time: it doesn't.

# Let's consider why the assumptions were violated.
# Both income and percentage women are strongly skewed. Remember that this
# in and of itself may not violate the model assumptions... but it is
# possible to transform the skewed variables to be more normal, and this
# might influence the distribution of residuals and satisfy the assumptions.
varPlot(d$income)
varPlot(d$women)

# One place to start is by eye-balling the univariate distribution and noting
# the direction of the skew. Log transformations spread out the lower values
# and condense the higher values---in other words, they pull in a long right tail.
# If the data are skewed right (as income is), this can counteract the skew.
# Raising to a power between 0 and 1 (where raising to 1/2 is the square root,
# 1/3 is the cube root, etc.) will impose a similar transformation, but may be
# a little less intuitive.
# Powers greater than 1 will skew in the opposite direction.

# But before we start playing with these transformations on these data,
# let's take a moment to clearly see what transformations do.

############################################################
#### Quick visualization of transformations             ####
############################################################
# To give a quick sense of what transforming variables looks like, we can start by
# using R to quickly create some fake data.
x <- 1:100
y <- 1:100

# If we were to plot these two variables against each other,
# what would it look like?
plot(x, y)
# It's a straight line with a positive slope.

# Now let's see what some transformations of x do.
# We can try several different powers.

# -2
plot(x^-2, y)
# The relationship is now negative and strongly curved; most points are bunched
# near zero on the transformed axis, where the curve is steepest.

# -.5
plot(x^-.5, y)
# Still negative, but the curvature is less drastic.

# .2
plot(x^.2, y)
# Positive; the slope increases with increasing x.

# 1.5
plot(x^1.5, y)
# The slope now decreases with increasing x.

# 5
plot(x^5, y)
# The slope is now very steep for low values of x, but decreases
# sharply with increasing x.

# log
plot(log(x), y)   # log() is the natural log (ln x, log base e)
# The slope is now very low for low values of x, but increases
# sharply with increasing x.

# Transformations of y

# -2
plot(x, y^-2)
# The slope starts steeply negative, then flattens toward 0 with increasing x.

# -.5
plot(x, y^-.5)
# Still negative, but the drop is less drastic than with -2.

# .2
plot(x, y^.2)
# The positive slope decreases with increasing x.

# 1.5
plot(x, y^1.5)
# The slope starts positive, and increases with increasing x.

# 5
plot(x, y^5)
# The slope now increases even more rapidly with increasing x.

# log
plot(x, log(y))
# The slope starts high, but decreases with increasing x.

# Now that we have a better understanding of what transforming data can do, let's
# return to our dataset and use modelBoxCox to help us figure out what
# transformation to use.
varPlot(d$income)
varPlot(d$women)
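# Before letting modelBoxCox() choose a power formally (next step), here is an
# informal, hedged preview: eye-ball what a log2 transformation does to the
# observed income distribution using the same varPlot() tool. (This previews the
# distribution of the variable, not the residuals, so it is only a rough guide.)
varPlot(log2(d$income))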
# modelBoxCox() finds a power to raise your dependent variable to that is most likely
# to correct model assumption problems.
modelBoxCox(mod)

# These results recommend raising income to the .51 power. Not the easiest thing to
# interpret. But clearly we want to spread out the lower values and contract the high
# values, because that is what powers between 0 and 1 do. Logs do this too, so we might
# transform by "log base 2" using log2().
# A one-unit increase in log2(income) represents a doubling of income.

# We'll apply that transformation and refit the model.
mod1 <- lm(log2(income) ~ women + education, data = d)
modelSummary(mod1)

# How do we interpret the coefficients of this model?
# Every year of additional education leads to a 0.17 unit increase in log2(income).
# In other words, for roughly every 6 additional years of education (1/0.17 = 5.9),
# income is expected to double.

# So, did this fix our model assumptions?
modelAssumptions(mod1, "NORMAL")
modelAssumptions(mod1, "CONSTANT")
modelAssumptions(mod1, "LINEAR")
# ...a little bit, but we still have some problems.

# We could try transforming percent women as well.
mod2 <- lm(log2(income) ~ sqrt(women) + education, data = d)
modelSummary(mod2)
modelAssumptions(mod2, "NORMAL")
modelAssumptions(mod2, "CONSTANT")
modelAssumptions(mod2, "LINEAR")
# This transformation doesn't really improve the viability of our
# assumptions and complicates the interpretation of our regression coefficients.
# Probably not worthwhile.

# Remember that violating the assumptions of linearity, equal variance, and normality
# is not a death knell for a model. Normality is the least concerning. Violations
# inflate standard errors, and violating linearity may bias the parameter estimates,
# but all three can often be handled through transformations.

# Our next step might be to consider models of the data that make different assumptions
# about the distribution of residuals. It may be that the GLM is not the right tool for
# the job. That said, remember that the GLM is quite robust to violations of normality
# and constant variance. Linearity is a little more troublesome, but even so, small
# violations are not worth panicking and running to find a more exotic method. Take a
# measured approach and seek advice from those with relevant expertise if you feel your
# data violate the GLM assumptions in an irreparable way. We will explore some options
# next semester.

# But, let us be satisfied with the model with log2(income)
# ...after we conduct a case analysis on it! All the threats of influential data points
# still exist in models with transformed variables. We'll want to check if there are
# any points worth removing.
res2 <- modelCaseAnalysis(mod1, "RESIDUALS")
d[res2$Rownames, ]
inf2 <- modelCaseAnalysis(mod1, "INFLUENCEPLOT")
d[inf2$Rownames, ]

# What did we learn?
# Babysitters and newsboys are regression outliers. Their levels of education and, in
# the case of newsboys, the low percentage of women do not predict their low incomes
# very well.

# Remove these problem cases.
d = dfRemoveCases(d, rownames(d)[which(d$name == 'newsboys')])
d = dfRemoveCases(d, rownames(d)[which(d$name == 'babysitters')])

mod3 <- lm(log2(income) ~ women + education, data = d)
modelSummary(mod3)
modelSummary(mod1)
# Did our parameter estimates change when we removed these outliers? Standard errors?
# F statistics?
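# One quick, hedged way (base R rather than lmSupport) to put the two coefficient
# tables side by side when answering the questions above:
round(coef(summary(mod1)), 4)   # log2(income) model with babysitters and newsboys included
round(coef(summary(mod3)), 4)   # the same model after removing them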
##################
#### Graphing ####
##################
library(ggplot2)   # publication-quality graphing

# Create our set of values on our predictors for which we want predictions.
d$log2income <- log2(d$income)
Yhat <- seq(min(d$women), max(d$women), length = 100)
Yhat <- data.frame(women = Yhat, education = mean(d$education))
dIncome <- modelPredictions(mod1, Yhat)

scatterplot <- ggplot() +
  geom_point(data = d, aes(x = women, y = log2income), color = "black") +
  geom_smooth(aes(ymin = CILo, ymax = CIHi, x = women, y = Predicted),
              data = dIncome, stat = "identity", color = "red") +
  theme_bw(base_size = 14) +
  # x-axis label and tick mark locations. Use scale_x_continuous() because x is
  # specified as a continuous variable in the default aes settings.
  scale_x_continuous("Percent women", breaks = seq(0, 100, by = 10)) +
  # y-axis label and tick mark locations. Use scale_y_continuous() because y is
  # specified as a continuous variable in the default aes settings.
  scale_y_continuous("Log2 Income", breaks = seq(8, 15, by = 1)) +
  labs(title = 'Effect of % Women on Income')
scatterplot

#### WITH A CURVED REGRESSION LINE ####
# Back-transform the predictions (and the confidence limits!) from the log2 scale
# to the raw income scale.
dIncome$Predicted <- 2^dIncome$Predicted
dIncome$CILo <- 2^dIncome$CILo
dIncome$CIHi <- 2^dIncome$CIHi

scatterplot <- ggplot() +
  geom_point(data = d, aes(x = women, y = income), color = "black") +
  geom_smooth(aes(ymin = CILo, ymax = CIHi, x = women, y = Predicted),
              data = dIncome, stat = "identity", color = "red") +
  theme_bw(base_size = 14) +
  # x-axis label and tick mark locations.
  scale_x_continuous("Percent women", breaks = seq(0, 100, by = 10)) +
  # y-axis label and tick mark locations.
  scale_y_continuous("Income", breaks = seq(500, 26000, by = 1500)) +
  labs(title = 'Effect of % Women on Income')
scatterplot

varDescribe(d$income)
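# If you want to save the publication-quality figure to disk, ggsave() from ggplot2
# will do it (the file name here is just an example; adjust size/format as needed).
ggsave("Lab8_income_by_women.png", scatterplot, width = 6, height = 4, dpi = 300)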