########################
#### HOMEWORK 6 KEY ####
########################

# 1. Read in the data
library(lmSupport)
library(psych)

d <- dfReadDat("HW6.dat")
varDescribe(d)
# One of the concern items goes up to 12, which is impossible.
varPlot(d$Concern1)
d[d$Concern1==12,]  # Only one, participant 02.
d[d$Concern2==11,]  # Just to make sure.

# 2. Remove Cases
# It seems clear participant 02 had a miscoded Concern1 value, so we'll remove this participant.
d <- dfRemoveCases(d, 2)
varDescribe(d)

# 3. Create ConcernM
d$ConcernM <- (varScore(d, Forward=c('Concern4'),
                        Reverse=c('Concern1', 'Concern2', 'Concern3'),
                        Range=c(1,10), MaxMiss=.25)) / 4

# 4. Create PostIAT
d$PostIAT <- rowMeans(cbind(d$Wk4IAT, d$Wk8IAT), na.rm=TRUE)
d$PostIAT <- (d$Wk4IAT + d$Wk8IAT) / 2
# These yield the same result when both scores are present. (With a missing score,
# rowMeans(na.rm=TRUE) would return the remaining score, while the arithmetic mean
# would return NA.)

# 5. Univariate plots
# Wrapping ConcernM in na.omit() drops its missing values, ensuring that varPlot()
# creates a density plot.
varPlot(na.omit(d$ConcernM))
varPlot(na.omit(d$BaseIAT))
varPlot(na.omit(d$PostIAT))

# 6. Bivariate correlations and scatterplot matrix
corr.test(d[, c("Condition", "ConcernM", "PostIAT", "BaseIAT")], use = "pairwise.complete.obs")
spm(d[, c("Condition", "ConcernM", "PostIAT", "BaseIAT")])
# There aren't particularly strong relationships among any of these variables.

# 7. Fit the model
Mod1 <- lm(PostIAT ~ Condition + ConcernM + BaseIAT, data = d)
modelSummary(Mod1, t=F)
# Baseline IAT score reliably predicts average post-test IAT score, F(1,75) = 4.43, p < .05,
# but neither training condition, F(1,75) = 3.45, p = .0672, nor mean concern about
# discrimination, F(1,75) = 0.168, p = .683, was a significant predictor of post-test IAT.

## Case Analysis ##

# 8. Leverage: Hat values
hats <- modelCaseAnalysis(Mod1, Type="HATVALUES")
hats
d[hats$Rownames,]
varDescribe(d)
# 25 has high concern and base IAT. 18 has high base IAT. 17 also has relatively high base IAT
# and low concern. 65 has low concern.
# Points with high leverage can bias our parameter estimates, but this concern is more pressing
# when the points are also regression outliers / have high influence.

# 9. Residuals
resis <- modelCaseAnalysis(Mod1, Type="RESIDUALS")
resis
d[resis$Rownames,]
# Only one case seems to be separated from the rest of the data: participant 01.
# This participant wasn't particularly high on leverage, because their PostIAT and concern
# scores weren't abnormally low overall. But their PostIAT score given their condition (no
# training) is inconsistent with our model and with the rest of the data, so the residual is
# fairly large. This might be a problematic point, but given that it doesn't meet a cut-off,
# we need more information. We may, however, suspect that this point will have high influence,
# at which point we might consider it problematic.

# 10. Influence: Cook's distance
cooks <- modelCaseAnalysis(Mod1, Type="COOKSD")
cooks
d[cooks$Rownames,]
# We see participant 01 is well separated from the rest of the data and might be problematic.
# Again, we might have expected this given the relatively large residual. However, notice that
# the Cook's D is less than .30, so although the point is separated from the rest of the data,
# it doesn't quite meet Judd et al.'s cut-off for a high-influence point. Therefore, this isn't
# a clear-cut case for removal (or a clear-cut case of being problematic).
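
# Optional cross-check (a sketch using base R only; it assumes the fitted model Mod1 from
# step 7): hatvalues(), rstudent(), and cooks.distance() compute the same statistics that
# the modelCaseAnalysis() plots above display, so ranking them numerically should flag the
# same cases.
caseStats <- data.frame(Hat     = hatvalues(Mod1),
                        StudRes = rstudent(Mod1),
                        CooksD  = cooks.distance(Mod1))
head(caseStats[order(-caseStats$Hat), ], 5)           # highest leverage
head(caseStats[order(-abs(caseStats$StudRes)), ], 5)  # largest studentized residuals
head(caseStats[order(-caseStats$CooksD), ], 5)        # most influential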
# 11. Decision
# Participant 01 is clearly abnormal with respect to our model and our data. It was a regression
# outlier and thus had the most influence on our parameter estimates. We probably want to remove
# this case, but we should run the analysis with and without it and be sure to report the
# results with this case included in a footnote.

# 12. Remove problematic cases
d2 <- dfRemoveCases(d, 1)

# 13. Fit models with and without outliers
Mod2 <- lm(PostIAT ~ Condition + ConcernM + BaseIAT, data = d2)

# 14. Compare models with and without outliers
modelSummary(Mod1, t=F)
modelSummary(Mod2, t=F)
# Removing the outlier reduces the overall error in our model, which we can see in the
# difference in R-squared between the two models. That is unsurprising, because we removed a
# regression outlier! The parameter estimates for all predictors change slightly, and in this
# case the change flips the significance decision for Condition, so we'd better be
# straightforward about this when we report it. The significance tests of the other variables
# do not change substantially.
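
# A quick side-by-side of the two fits (a sketch; coef() and confint() are base R and make
# the shift in the Condition estimate easy to see in one table).
round(cbind(WithCase01    = coef(Mod1),
            WithoutCase01 = coef(Mod2)), 3)
confint(Mod1)  # 95% CIs with participant 01 included
confint(Mod2)  # 95% CIs with participant 01 removed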