# HW 8 Key
# Psych 610/710

library(car)
library(lmSupport)
library(ggplot2)

# 1. Descriptive stats and univariate plots

d <- UN
str(d)
summary(d)
varDescribe(d)

# Graph the infant mortality and GDP plots in the same window
old.par <- par()      # store default settings
par(mfrow = c(1, 2))  # change default settings (number of plots per figure)
varPlot(na.omit(d$infant.mortality), VarName = 'Infant Mortality')
varPlot(na.omit(d$gdp), VarName = 'GDP')

# 2. Construct a model to predict infant mortality from national GDP.

m1 <- lm(infant.mortality ~ gdp, data = d)
modelSummary(m1)
confint(m1)
modelEffectSizes(m1)

# A word about missing data: when you pass a dataframe to lm() to fit a linear
# model, all rows (i.e., observations) with missing data on the model
# variables are dropped. This is done internally with na.omit(), which takes a
# dataframe as its argument and returns a new dataframe containing only the
# complete observations from the original. If some other function does not
# handle missing data well, na.omit() is one way to sidestep the problem. In
# practice, however, you need to be mindful about how you handle missing data.

# Given that lm() automatically drops rows containing missing values, what if
# you want to see which observations were actually used to fit your model?
m1$model  # returns the observations used to fit the model

# 3. Conduct case analysis and remove outliers. [Note: The point is not that
# you remove exactly the points I remove. Explain your decision succinctly,
# and move on.]

### Leverage
pts.hval <- modelCaseAnalysis(m1, 'HATVALUES')
d[pts.hval$Rownames, ]
# Austria, Denmark, Germany, Japan, Luxembourg, Norway, and Switzerland have
# high leverage (i.e., GDPs far from the sample mean of GDP).

### Model outliers
windows()
pts.rstu <- modelCaseAnalysis(m1, 'RESIDUALS')
d[pts.rstu$Rownames, ]
# There were no flagrant model outliers.
# Note: It is already clear that the model residuals are quite positively
# skewed. After transformation, the interpretation of outliers might change.

### Influence on the model
windows()
pts.cooksd <- modelCaseAnalysis(m1, 'COOKSD')
d[pts.cooksd$Rownames, ]
# Japan, Sierra Leone, and Switzerland have the most influence on the model
# overall.

### Influence on parameter estimates
windows()
modelCaseAnalysis(m1, 'DFBETAS')
# Japan and Switzerland have the most influence on the GDP estimate.

### Influence plots
windows()
pts.infplot <- modelCaseAnalysis(m1, 'INFLUENCEPLOT')
d[pts.infplot$Rownames, ]

# To be conservative, let's omit all cases with influence on any parameter
# (including the intercept).
InfluentialNations <- c('Sierra.Leone', 'Luxembourg', 'Norway', 'Denmark',
                        'Switzerland', 'Japan')
dFull <- d
d <- dfRemoveCases(d, InfluentialNations)

# 4. Refit the model with these outliers removed.

m2 <- lm(infant.mortality ~ gdp, data = d)
modelSummary(m2)

# 5. Check for violations of assumptions.

### Normality
modelAssumptions(m2, Type = "NORMAL")
# The residuals definitely look non-normal.

### Constant variance
modelAssumptions(m2, Type = "CONSTANT")
# Despite the console readout indicating that the constant-variance assumption
# is met, the plot looks anything but constant.

### Linearity
modelAssumptions(m2, Type = "LINEAR")
# The relationship looks exceedingly non-linear.
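# (An optional base-R sketch, not part of the assignment: the same assumption
# checks can be eyeballed with standard diagnostics if you want to confirm
# what modelAssumptions() shows.)
par(mfrow = c(1, 2))
plot(m2, which = 1)   # residuals vs. fitted: curvature and uneven spread
qqnorm(rstudent(m2))  # normality of the studentized residuals
qqline(rstudent(m2))
par(mfrow = c(1, 1))  # reset the plotting layout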
# Since the assumptions are violated even with the outliers excluded, it might
# make sense to put the outliers back in, check whether that changes the state
# of our assumptions, and, if not, try to satisfy the assumptions with the
# full dataset. It may be that once the data are transformed, the outliers
# look more like the rest of the data and no longer need to be removed.

# 6. Suggested Box-Cox transformation? Preferable interpretable transformation?

modelBoxCox(m1)
# Box-Cox recommends a negative power transformation that is near zero. That
# suggests a log transformation may do the trick (and would certainly be
# easier to interpret).

# 7. Transform infant mortality

dFull$infant.mortalityLog2 <- log2(dFull$infant.mortality)
mt1 <- lm(infant.mortalityLog2 ~ gdp, data = dFull)
modelAssumptions(mt1, Type = "NORMAL")
modelAssumptions(mt1, Type = "CONSTANT")
modelAssumptions(mt1, Type = "LINEAR")
# Transforming infant mortality with log base 2 perceptibly brings the data
# more in line with the assumptions.

# 8. Transform GDP

dFull$gdpLog2 <- log2(dFull$gdp)
mt2 <- lm(infant.mortality ~ gdpLog2, data = dFull)
modelAssumptions(mt2, Type = "NORMAL")
modelAssumptions(mt2, Type = "CONSTANT")
modelAssumptions(mt2, Type = "LINEAR")
# Transforming just GDP helps combat the skew and kurtosis in the residuals.
# The variance of the residuals still appears uneven on visual inspection
# (although the test says the homoscedasticity assumption is met!), and the
# residuals look quite non-linear.

# 9. Both transformations

mt3 <- lm(infant.mortalityLog2 ~ gdpLog2, data = dFull)
modelAssumptions(mt3, Type = "NORMAL")
modelAssumptions(mt3, Type = "CONSTANT")
modelAssumptions(mt3, Type = "LINEAR")
# After transforming both variables, the residuals visually conform much
# better to expectations, particularly with respect to the constancy of
# variance. Normality has been compromised a bit---the residuals are a little
# too peaked, and in fact there appear to be two peaks. Still, this is
# certainly better than modeling the raw data. It would now be wise to redo
# the case analysis on the transformed data.

# 10. Conduct case analysis again.

pts.hval <- modelCaseAnalysis(mt3, 'HATVALUES')
# Sao Tome and Sudan have high leverage.

pts.rstu <- modelCaseAnalysis(mt3, 'RESIDUALS')
# Tonga and Iraq might be model outliers.

pts.cooksd <- modelCaseAnalysis(mt3, 'COOKSD')
# Iraq, Sao Tome, Sudan, and Bosnia have general influence.

windows()
modelCaseAnalysis(mt3, 'DFBETAS')
# Iraq, Sao Tome, Sudan, and Bosnia have influence on the GDP estimate.

pts.infplot <- modelCaseAnalysis(mt3, 'INFLUENCEPLOT')
# Sao Tome, Sudan, Bosnia, and Iraq influence the relationship between GDP
# and infant mortality.

# Note: This set of outliers is completely different from the original
# outliers in the model with the raw data. Why? Before transforming, the
# countries with the highest GDP had extreme leverage and influence on the
# model. After transforming, the influential points are those that the model
# fits worst, particularly countries with quite low infant mortality given
# their GDP. Because of the severe positive skew, the most influential points
# were originally the ones with tremendous leverage; they would also have
# been model outliers because of the nonlinearity of the relationship between
# GDP and infant mortality. The transformations make the relationship much
# more linear and scrunch the high-GDP countries in closer to the rest of the
# group, so they are no longer outliers or high-leverage cases.
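# (An optional sketch to see this concretely: compare the largest hat values
# on the raw and transformed predictors. hatvalues() is base R; the log
# transformation should shrink the leverage of the high-GDP countries.)
head(sort(hatvalues(lm(infant.mortality ~ gdp, data = dFull)),
          decreasing = TRUE))
head(sort(hatvalues(lm(infant.mortalityLog2 ~ gdpLog2, data = dFull)),
          decreasing = TRUE))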
# 11. Remove these outliers and refit the model. Interpret the coefficient
# associated with GDP.

InfluentialNationsT <- c("Sao.Tome", "Sudan", "Bosnia", "Iraq", "Tonga")
d <- dfRemoveCases(dFull, InfluentialNationsT)
d$infant.mortalityLog2 <- log2(d$infant.mortality)
d$gdpLog2 <- log2(d$gdp)

mt4 <- lm(infant.mortalityLog2 ~ gdpLog2, data = d)
modelSummary(mt4)
modelEffectSizes(mt4)
confint(mt4)
# Every one-unit increase in a log2-transformed variable corresponds to a
# doubling of the raw variable. The slope here is about -0.5, so a fourfold
# increase in GDP (two units of gdpLog2) predicts a one-unit decrease in
# infant.mortalityLog2. In other words, for every fourfold increase in GDP,
# the model predicts that infant mortality will be halved.

# 12. Comparing SEs

modelSummary(m1)
modelSummary(mt3)
# SSE is lower in the model with transformed variables.
# The SE for the intercept is lower in the transformed model.
# The SE for the GDP estimate is higher in the transformed model.
# The SEs differ between the models because the models are asking different
# questions! It is not fair to compare standard errors between the
# untransformed and transformed models because the two models are in
# different units. In other words, you wouldn't compare the standard error of
# a model predicting monkeys' consumption of regularly curved bananas with
# that of a model predicting monkeys' consumption of bananas that you
# carefully tried to straighten out (without breaking them). It may happen
# that you can more accurately predict monkeys' consumption of your new
# straight bananas, but that is not necessarily because there is anything
# better about the new bananas. In fact, it is arguably more practically
# useful to know how many regular bananas monkeys eat (in case you decide to
# open a banana stand near a jungle).

# 13. Plot the effect of GDP on infant mortality.

# Create the set of values on the predictor for which we want predictions
gdpLog2Values <- seq(6, 16, length.out = 100)
dMortality <- data.frame(gdpLog2 = gdpLog2Values)
dMortality <- modelPredictions(mt4, dMortality)

scatterplot <- ggplot() +
  geom_point(data = d, aes(x = gdpLog2, y = infant.mortalityLog2),
             color = "black") +
  geom_smooth(data = dMortality,
              aes(x = gdpLog2, y = Predicted, ymin = CILo, ymax = CIHi),
              stat = "identity", color = "red") +
  theme_bw(base_size = 14) +
  scale_x_continuous("GDP (log2)", breaks = seq(6, 16, by = 1)) +
  scale_y_continuous("Infant Mortality (log2)", breaks = seq(0, 8, by = 1)) +
  labs(title = 'Effect of GDP on Infant Mortality')
scatterplot

# 14. Curvilinear fit line

scatterplot <- ggplot(data = d, aes(x = gdp, y = infant.mortality)) +
  geom_point(color = "black") +
  stat_smooth(method = 'lm', formula = y ~ log(x)) +
  theme_bw(base_size = 14) +
  scale_x_continuous("GDP per capita", breaks = seq(0, 45000, by = 10000)) +
  scale_y_continuous("Infant mortality rate", breaks = seq(0, 200, by = 25)) +
  labs(title = 'Effect of GDP on Infant Mortality')
scatterplot
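# (An optional sketch: the log2-log2 model itself can also be displayed on
# the raw scales by back-transforming its predictions with 2^x. This assumes
# dMortality still holds the Predicted values computed in section 13.)
dMortality$gdpRaw <- 2^dMortality$gdpLog2
dMortality$PredictedRaw <- 2^dMortality$Predicted
ggplot(data = d, aes(x = gdp, y = infant.mortality)) +
  geom_point(color = "black") +
  geom_line(data = dMortality, aes(x = gdpRaw, y = PredictedRaw),
            color = "red") +
  theme_bw(base_size = 14) +
  labs(title = 'Back-transformed predictions from the log2-log2 model')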