# HW 8 Key
# Psych 610/710

library(car)
library(lmSupport)
library(ggplot2)

# 1. Descriptive stats and univariate plots

d <- UN
str(d)
summary(d)
varDescribe(d)

# Graph the infant mortality and GDP plots in the same window
old.par <- par()      # store default settings
par(mfrow = c(1, 2))  # change default settings (number of plots per figure)
varPlot(na.omit(d$infant.mortality), VarName = 'Infant Mortality')
varPlot(na.omit(d$gdp), VarName = 'GDP')

# 2. Construct a model to predict infant mortality from national GDP.

m1 <- lm(infant.mortality ~ gdp, data = d)
modelSummary(m1)
confint(m1)
modelEffectSizes(m1)

# A word about missing data: when you pass a dataframe to lm() to fit a linear
# model, all rows (i.e., observations) with missing data on the model
# variables are dropped. This is done internally with na.omit(), which takes a
# dataframe as its argument and returns a new dataframe containing only the
# complete observations from the original. If some other function does not
# handle missing data well, na.omit() is one way to sidestep the problem. In
# practice, however, you need to be mindful about how you handle missing data.

# Given that lm() automatically drops rows containing missing values, what if
# you want to see which observations were actually used to fit your model?
m1$model  # returns the observations used to fit the model

# 3. Conduct case analysis and remove outliers. [Note: The point is not that
# you remove exactly the points I remove. Explain your decision succinctly,
# and move on.]

### Leverage
pts.hval <- modelCaseAnalysis(m1, 'HATVALUES')
d[pts.hval$Rownames, ]
# Austria, Denmark, Germany, Japan, Luxembourg, Norway, and Switzerland have
# high leverage (i.e., GDPs far from the sample mean of GDP).

### Model outliers
windows()
pts.rstu <- modelCaseAnalysis(m1, 'RESIDUALS')
d[pts.rstu$Rownames, ]
# There were no flagrant model outliers.
# Note: It is already clear that the model residuals are quite positively
# skewed. After transformation, the interpretation of outliers might change.

### Influence on the model
windows()
pts.cooksd <- modelCaseAnalysis(m1, 'COOKSD')
d[pts.cooksd$Rownames, ]
# Japan, Sierra Leone, and Switzerland have the most influence on the model
# overall.

### Influence on parameter estimates
windows()
modelCaseAnalysis(m1, 'DFBETAS')
# Japan and Switzerland have the most influence on the GDP estimate.

### Influence plots
windows()
pts.infplot <- modelCaseAnalysis(m1, 'INFLUENCEPLOT')
d[pts.infplot$Rownames, ]

# To be conservative, let's omit all cases with influence on any parameter
# (including the intercept).
InfluentialNations <- c('Sierra.Leone', 'Luxembourg', 'Norway', 'Denmark',
                        'Switzerland', 'Japan')
dFull <- d
d <- dfRemoveCases(d, InfluentialNations)

# 4. Refit the model with these outliers removed.

m2 <- lm(infant.mortality ~ gdp, data = d)
modelSummary(m2)

# 5. Check for violations of assumptions.

### Normality
modelAssumptions(m2, Type = "NORMAL")
# The residuals definitely look non-normal.

### Constant variance
modelAssumptions(m2, Type = "CONSTANT")
# Despite the console readout indicating that the constant-variance assumption
# is met, the plot looks anything but constant.

### Linearity
modelAssumptions(m2, Type = "LINEAR")
# The relationship looks exceedingly non-linear.
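# (An optional base-R sketch, not part of the assignment: the same assumption
# checks can be eyeballed with standard diagnostics if you want to confirm
# what modelAssumptions() shows.)
par(mfrow = c(1, 2))
plot(m2, which = 1)   # residuals vs. fitted: curvature and uneven spread
qqnorm(rstudent(m2))  # normality of the studentized residuals
qqline(rstudent(m2))
par(mfrow = c(1, 1))  # reset the plotting layout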
# Since the assumptions are violated even with the outliers excluded, it might
# make sense to put the outliers back in, check whether that changes the state
# of our assumptions, and, if not, try to satisfy the assumptions with the
# full dataset. It may be that once the data are transformed, the outliers
# look more like the rest of the data and no longer need to be removed.

# 6. Suggested Box-Cox transformation? Preferable interpretable transformation?

modelBoxCox(m1)
# Box-Cox recommends a negative power transformation that is near zero. That
# suggests a log transformation may do the trick (and would certainly be
# easier to interpret).

# 7. Transform infant mortality

dFull$infant.mortalityLog2 <- log2(dFull$infant.mortality)
mt1 <- lm(infant.mortalityLog2 ~ gdp, data = dFull)
modelAssumptions(mt1, Type = "NORMAL")
modelAssumptions(mt1, Type = "CONSTANT")
modelAssumptions(mt1, Type = "LINEAR")
# Transforming infant mortality with log base 2 perceptibly brings the data
# more in line with the assumptions.

# 8. Transform GDP

dFull$gdpLog2 <- log2(dFull$gdp)
mt2 <- lm(infant.mortality ~ gdpLog2, data = dFull)
modelAssumptions(mt2, Type = "NORMAL")
modelAssumptions(mt2, Type = "CONSTANT")
modelAssumptions(mt2, Type = "LINEAR")
# Transforming just GDP helps combat the skew and kurtosis in the residuals.
# The variance of the residuals still appears uneven on visual inspection
# (although the test says the homoscedasticity assumption is met!), and the
# residuals look quite non-linear.

# 9. Both transformations

mt3 <- lm(infant.mortalityLog2 ~ gdpLog2, data = dFull)
modelAssumptions(mt3, Type = "NORMAL")
modelAssumptions(mt3, Type = "CONSTANT")
modelAssumptions(mt3, Type = "LINEAR")
# After transforming both variables, the residuals visually conform much
# better to expectations, particularly with respect to the constancy of
# variance. Normality has been compromised a bit---the residuals are a little
# too peaked, and in fact there appear to be two peaks. Still, this is
# certainly better than modeling the raw data. It would now be wise to redo
# the case analysis on the transformed data.

# 10. Conduct case analysis again.

pts.hval <- modelCaseAnalysis(mt3, 'HATVALUES')
# Sao Tome and Sudan have high leverage.

pts.rstu <- modelCaseAnalysis(mt3, 'RESIDUALS')
# Tonga and Iraq might be model outliers.

pts.cooksd <- modelCaseAnalysis(mt3, 'COOKSD')
# Iraq, Sao Tome, Sudan, and Bosnia have general influence.

windows()
modelCaseAnalysis(mt3, 'DFBETAS')
# Iraq, Sao Tome, Sudan, and Bosnia have influence on the GDP estimate.

pts.infplot <- modelCaseAnalysis(mt3, 'INFLUENCEPLOT')
# Sao Tome, Sudan, Bosnia, and Iraq influence the relationship between GDP
# and infant mortality.

# Note: This set of outliers is completely different from the original
# outliers in the model with the raw data. Why? Before transforming, the
# countries with the highest GDP had extreme leverage and influence on the
# model. After transforming, the influential points are those that the model
# fits worst, particularly countries with quite low infant mortality given
# their GDP. Because of the severe positive skew, the most influential points
# were originally the ones with tremendous leverage; they would also have
# been model outliers because of the nonlinearity of the relationship between
# GDP and infant mortality. The transformations make the relationship much
# more linear and scrunch the high-GDP countries in closer to the rest of the
# group, so they are no longer outliers or high-leverage cases.
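# (An optional sketch to see this concretely: compare the largest hat values
# on the raw and transformed predictors. hatvalues() is base R; the log
# transformation should shrink the leverage of the high-GDP countries.)
head(sort(hatvalues(lm(infant.mortality ~ gdp, data = dFull)),
          decreasing = TRUE))
head(sort(hatvalues(lm(infant.mortalityLog2 ~ gdpLog2, data = dFull)),
          decreasing = TRUE))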
# 11. Remove these outliers and refit the model. Interpret the coefficient
# associated with GDP.

InfluentialNationsT <- c("Sao.Tome", "Sudan", "Bosnia", "Iraq", "Tonga")
d <- dfRemoveCases(dFull, InfluentialNationsT)
d$infant.mortalityLog2 <- log2(d$infant.mortality)
d$gdpLog2 <- log2(d$gdp)

mt4 <- lm(infant.mortalityLog2 ~ gdpLog2, data = d)
modelSummary(mt4)
modelEffectSizes(mt4)
confint(mt4)
# Every one-unit increase in a log2-transformed variable corresponds to a
# doubling of the raw variable. The slope here is about -0.5, so a fourfold
# increase in GDP (two units of gdpLog2) predicts a one-unit decrease in
# infant.mortalityLog2. In other words, for every fourfold increase in GDP,
# the model predicts that infant mortality will be halved.

# 12. Comparing SEs

modelSummary(m1)
modelSummary(mt3)
# SSE is lower in the model with transformed variables.
# The SE for the intercept is lower in the transformed model.
# The SE for the GDP estimate is higher in the transformed model.
# The SEs differ between the models because the models are asking different
# questions! It is not fair to compare standard errors between the
# untransformed and transformed models because the two models are in
# different units. In other words, you wouldn't compare the standard error of
# a model predicting monkeys' consumption of regularly curved bananas with
# that of a model predicting monkeys' consumption of bananas that you
# carefully tried to straighten out (without breaking them). It may happen
# that you can more accurately predict monkeys' consumption of your new
# straight bananas, but that is not necessarily because there is anything
# better about the new bananas. In fact, it is arguably more practically
# useful to know how many regular bananas monkeys eat (in case you decide to
# open a banana stand near a jungle).

# 13. Plot the effect of GDP on infant mortality.

# Create the set of values on the predictor for which we want predictions
gdpLog2Values <- seq(6, 16, length.out = 100)
dMortality <- data.frame(gdpLog2 = gdpLog2Values)
dMortality <- modelPredictions(mt4, dMortality)

scatterplot <- ggplot() +
  geom_point(data = d, aes(x = gdpLog2, y = infant.mortalityLog2),
             color = "black") +
  geom_smooth(data = dMortality,
              aes(x = gdpLog2, y = Predicted, ymin = CILo, ymax = CIHi),
              stat = "identity", color = "red") +
  theme_bw(base_size = 14) +
  scale_x_continuous("GDP (log2)", breaks = seq(6, 16, by = 1)) +
  scale_y_continuous("Infant Mortality (log2)", breaks = seq(0, 8, by = 1)) +
  labs(title = 'Effect of GDP on Infant Mortality')
scatterplot

# 14. Curvilinear fit line

scatterplot <- ggplot(data = d, aes(x = gdp, y = infant.mortality)) +
  geom_point(color = "black") +
  stat_smooth(method = 'lm', formula = y ~ log(x)) +
  theme_bw(base_size = 14) +
  scale_x_continuous("GDP per capita", breaks = seq(0, 45000, by = 10000)) +
  scale_y_continuous("Infant mortality rate", breaks = seq(0, 200, by = 25)) +
  labs(title = 'Effect of GDP on Infant Mortality')
scatterplot
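# (An optional sketch: the log2-log2 model itself can also be displayed on
# the raw scales by back-transforming its predictions with 2^x. This assumes
# dMortality still holds the Predicted values computed in section 13.)
dMortality$gdpRaw <- 2^dMortality$gdpLog2
dMortality$PredictedRaw <- 2^dMortality$Predicted
ggplot(data = d, aes(x = gdp, y = infant.mortality)) +
  geom_point(color = "black") +
  geom_line(data = dMortality, aes(x = gdpRaw, y = PredictedRaw),
            color = "red") +
  theme_bw(base_size = 14) +
  labs(title = 'Back-transformed predictions from the log2-log2 model')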