########################
#### HOMEWORK 6 KEY ####
########################

# 1. Read in the data
library(lmSupport)
library(psych)

d <- dfReadDat("HW6.dat")
varDescribe(d)
# One of the concern items goes up to 12, which is impossible.
varPlot(d$Concern1)
d[d$Concern1==12,]  # Only one, participant 02.
d[d$Concern2==11,]  # Just to make sure.

# 2. Remove Cases
# It seems clear participant 02 had a miscoded Concern1 value, so we'll remove this participant.
d <- dfRemoveCases(d, 2)
varDescribe(d)

# 3. Create ConcernM
d$ConcernM <- (varScore(d, Forward=c('Concern4'),
                        Reverse=c('Concern1', 'Concern2', 'Concern3'),
                        Range=c(1,10), MaxMiss=.25)) / 4

# 4. Create PostIAT
d$PostIAT <- rowMeans(cbind(d$Wk4IAT, d$Wk8IAT), na.rm=TRUE)
d$PostIAT <- (d$Wk4IAT + d$Wk8IAT) / 2
# These yield the same result when both scores are present. (With a missing score,
# rowMeans(na.rm=TRUE) would return the remaining score, while the arithmetic mean
# would return NA.)

# 5. Univariate plots
# Wrapping ConcernM in na.omit() drops its missing values, ensuring that varPlot()
# creates a density plot.
varPlot(na.omit(d$ConcernM))
varPlot(na.omit(d$BaseIAT))
varPlot(na.omit(d$PostIAT))

# 6. Bivariate correlations and scatterplot matrix
corr.test(d[, c("Condition", "ConcernM", "PostIAT", "BaseIAT")], use = "pairwise.complete.obs")
spm(d[, c("Condition", "ConcernM", "PostIAT", "BaseIAT")])
# There aren't particularly strong relationships among any of these variables.

# 7. Fit the model
Mod1 <- lm(PostIAT ~ Condition + ConcernM + BaseIAT, data = d)
modelSummary(Mod1, t=F)
# Baseline IAT score reliably predicts average post-test IAT score, F(1,75) = 4.43, p < .05,
# but neither training condition, F(1,75) = 3.45, p = .0672, nor mean concern about
# discrimination, F(1,75) = 0.168, p = .683, was a significant predictor of post-test IAT.

## Case Analysis ##

# 8. Leverage: Hat values
hats <- modelCaseAnalysis(Mod1, Type="HATVALUES")
hats
d[hats$Rownames,]
varDescribe(d)
# 25 has high concern and base IAT. 18 has high base IAT. 17 also has relatively high base IAT
# and low concern. 65 has low concern.
# Points with high leverage can bias our parameter estimates, but this concern is more pressing
# when the points are also regression outliers / have high influence.

# 9. Residuals
resis <- modelCaseAnalysis(Mod1, Type="RESIDUALS")
resis
d[resis$Rownames,]
# Only one case seems to be separated from the rest of the data: participant 01.
# This participant wasn't particularly high on leverage, because their PostIAT and concern
# scores weren't abnormally low overall. But their PostIAT score given their condition (no
# training) is inconsistent with our model and with the rest of the data, so the residual is
# fairly large. This might be a problematic point, but given that it doesn't meet a cut-off,
# we need more information. We may, however, suspect that this point will have high influence,
# at which point we might consider it problematic.

# 10. Influence: Cook's distance
cooks <- modelCaseAnalysis(Mod1, Type="COOKSD")
cooks
d[cooks$Rownames,]
# We see participant 01 is well separated from the rest of the data and might be problematic.
# Again, we might have expected this given the relatively large residual. However, notice that
# the Cook's D is less than .30, so although the point is separated from the rest of the data,
# it doesn't quite meet Judd et al.'s cut-off for a high-influence point. Therefore, this isn't
# a clear-cut case for removal (or a clear-cut case of being problematic).
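
# Optional cross-check (a sketch using base R only; it assumes the fitted model Mod1 from
# step 7): hatvalues(), rstudent(), and cooks.distance() compute the same statistics that
# the modelCaseAnalysis() plots above display, so ranking them numerically should flag the
# same cases.
caseStats <- data.frame(Hat     = hatvalues(Mod1),
                        StudRes = rstudent(Mod1),
                        CooksD  = cooks.distance(Mod1))
head(caseStats[order(-caseStats$Hat), ], 5)           # highest leverage
head(caseStats[order(-abs(caseStats$StudRes)), ], 5)  # largest studentized residuals
head(caseStats[order(-caseStats$CooksD), ], 5)        # most influential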
# 11. Decision
# Participant 01 is clearly abnormal with respect to our model and our data. It was a regression
# outlier and thus had the most influence on our parameter estimates. We probably want to remove
# this case, but we should run the analysis with and without it and be sure to report the
# results with this case included in a footnote.

# 12. Remove problematic cases
d2 <- dfRemoveCases(d, 1)

# 13. Fit models with and without outliers
Mod2 <- lm(PostIAT ~ Condition + ConcernM + BaseIAT, data = d2)

# 14. Compare models with and without outliers
modelSummary(Mod1, t=F)
modelSummary(Mod2, t=F)
# Removing the outlier reduces the overall error in our model, which we can see in the
# difference in R-squared between the two models. That is unsurprising, because we removed a
# regression outlier! The parameter estimates for all predictors change slightly, and in this
# case the change flips the significance decision for Condition, so we'd better be
# straightforward about this when we report it. The significance tests of the other variables
# do not change substantially.
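
# A quick side-by-side of the two fits (a sketch; coef() and confint() are base R and make
# the shift in the Condition estimate easy to see in one table).
round(cbind(WithCase01    = coef(Mod1),
            WithoutCase01 = coef(Mod2)), 3)
confint(Mod1)  # 95% CIs with participant 01 included
confint(Mod2)  # 95% CIs with participant 01 removed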