####################################################
## Week 8 Lab: Assumptions and transformations    ##
## Friday, October 20th, 2017                     ##
####################################################

############################################################
#### First Steps: Examine the data & check for outliers ####
############################################################

library(lmSupport)
library(car)
library(psych)

# We are going to return to the Prestige dataset. Remind yourself what's going on here.
d = dfReadDat('Lab8_DemoData.dat', SubID = "SubID")
str(d)
varDescribe(d)

# Remember we had a value that was clearly a data entry error.
# So let's remove it.
d = dfRemoveCases(d, which(d$education == -10.7))
# varDescribe(d)

# Univariate exploration
varPlot(d$income)
varPlot(d$education)
varPlot(d$women)
# Note: This is NOT how we assess normality.

# Bivariate exploration
# Do the correlations match your intuitions?
corr.test(d[, c("income", "education", "women")])
spm(d[, c("income", "education", "women")])

# Fit our linear model predicting income from gender composition and education.
mod0 <- lm(income ~ women + education, data = d)
modelSummary(mod0)
modelEffectSizes(mod0)

#### Case Analyses: Residuals ####
# Are there cases that don't fit the model? Which?
res <- modelCaseAnalysis(mod0, "RESIDUALS")
d[res$Rownames, ]
# When we conducted a case analysis with this model, we discovered two cases
# (general managers and physicians) that were both ill-fit by the model.

#### Case Analyses: Influence & Leverage ####
# Are there cases that exert a lot of influence on our model predictions? Which?
# Are they the same cases that did not fit the model? (From the last analysis.)
inf <- modelCaseAnalysis(mod0, "INFLUENCEPLOT")
d[inf$Rownames, ]
# We discover that the same two cases (general managers and physicians)
# have a lot of influence. Furthermore, sewing machine operators have high
# leverage, but relatively small influence.

#### Case Analyses: Removing problematic points ####
# Let's remove these data points before testing our model assumptions.
# It's possible that a couple of extreme outliers can cause assumptions to be
# violated, and by simply eliminating them the rest of the data conform.
dFull <- d   # just a backup of the full data frame
d = dfRemoveCases(d, rownames(d)[which(d$name == 'general.managers')])
d = dfRemoveCases(d, rownames(d)[which(d$name == 'physicians')])

mod <- lm(income ~ education + women, data = d)
modelSummary(mod)
modelEffectSizes(mod)

# We also discovered that these cases were extreme on income and that both
# income and percentage women have distributions that look non-normal.
# In a situation like this, we might wonder if our two problematic cases
# are caused not by cases that are somehow "weird" but by the fact that
# the underlying distributions of our variables are causing systematic violations
# of the assumptions that the General Linear Model makes about the structure
# of our data.
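# A quick, hedged aside (not a required lab step): the skew() function from psych
# (already loaded above) puts a number on what the varPlot() histograms show.
# Values well above 0 indicate a long right tail.
skew(d$income)
skew(d$women)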
#################################################
#### Second Step: Checking Model Assumptions ####
#################################################

# The GLM makes five primary assumptions about the structure of the data:
#  (1) no measurement error in the predictors;
#  (2) independence of residuals;
#  (3) normality of residuals;
#  (4) constant variance (homoscedasticity);
#  (5) linearity of relationships.

# Violations of assumptions can cause:
#  1. Inefficient standard errors (low power, type II errors)
#  2. Inaccurate standard errors (incorrect statistical tests, type I errors)
#  3. Inaccurate parameter estimates

# Assumptions (3) - (5) can be evaluated using modelAssumptions().
# Assumption (1) is always violated to some extent, but essentially demands
# that you use the most reliable measures at your disposal.
# Assumption (2) can be addressed by a variety of approaches (e.g., repeated measures analysis).

###########################################
#### Assumption 3: Normality of errors ####
###########################################
library("gvlma")

# Remember that "normality" in this case refers to the model residuals, NOT the
# distribution of any single variable in the model.
# A quantile-comparison (Q-Q) plot of the studentized residuals readily displays
# how well the studentized residuals compare to a theoretical t-distribution.
# modelAssumptions() will give you this and more.
modelAssumptions(mod, "NORMAL")

# The function will also spit out some statistical tests of these assumptions.
# THEY ARE NO SUBSTITUTE FOR YOUR OWN VISUAL INSPECTION OF THE RESIDUALS!
# In the case of our model predicting income from education and percentage women,
# modelAssumptions tells us that our data seem to violate all the assumptions
# of the GLM. But this is not necessarily diagnostic on its own.

# So, should we be concerned at this point? Well, we're not doing very well on
# normality, but let's not make any brash decisions yet.

#########################################
#### Assumption 4: Constant variance ####
#########################################
# The constant variance assumption implies that, for any given predicted value
# of the dependent variable, the residuals will have the same variance.
# One easy way to assess the viability of this assumption is to plot the predicted
# values against the residuals and assess visually whether the residuals
# have more scatter at some predicted values than at others.
# modelAssumptions() will generate that figure, as well as a plot of the predicted
# values versus the absolute value of the residuals (a "spread-level plot").
modelAssumptions(mod, "CONSTANT")

# So... are we concerned now? We're not doing very well in terms of the constant
# variance assumption, but again, we won't make conclusive decisions yet.

#############################
## Assumption 5: Linearity ##
#############################
# If the relationship between a predictor and Y is linear, then the slope of the
# best-fitting line should be approximately the same no matter what sub-sample of
# the data you consider.
# Imagine sorting your dataframe on one of your predictor variables, sliding a
# window down the dataframe, and fitting little models for each window. If the
# relationship is linear, the solutions for the sub-models should be very similar,
# deviating from each other only randomly. On the other hand, if the relationship of
# X and Y is non-linear, then the estimate of the slope will differ systematically
# at different points along X. (A rough do-it-yourself sketch of this idea appears
# at the end of this section.) modelAssumptions() will plot such lines, overlaid on
# the residual scores, in addition to the best linear fit.
modelAssumptions(mod, "LINEAR")

# All that detail aside, if the green line and the red line don't look systematically
# different, then the relationship between that X and Y is relatively linear. Otherwise,
# there may be a non-linear relationship.

# So, we're actually doing pretty well in terms of the linearity assumption. There
# aren't any places where the red and green lines radically diverge.
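# A rough, do-it-yourself sketch of the sliding-window idea described above.
# This is NOT what modelAssumptions() does internally; it is just an illustration,
# and the window size (30) and step (10) are arbitrary choices for this example.
dSorted <- d[order(d$education), ]   # sort cases on the predictor
windowSize <- 30
starts <- seq(1, nrow(dSorted) - windowSize + 1, by = 10)
windowSlopes <- sapply(starts, function(i) {
  # slope of income on education within this window of cases
  coef(lm(income ~ education, data = dSorted[i:(i + windowSize - 1), ]))[["education"]]
})
plot(starts, windowSlopes, type = "b",
     xlab = "Window start (cases sorted by education)",
     ylab = "Within-window slope of income on education")
# If the income~education relationship is linear, these slopes should bounce around
# a single value rather than trend systematically.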
############################
#### So, what do we do? ####
############################

# We have discovered that our data violate the constant variance and normality of
# errors assumptions. We will attempt to correct some of these issues by applying
# transformations. But here is an interesting point: we detected our outliers in a
# model with the raw scores. The outliers may be different in a model based on
# transformed data.
# To be thorough, it might be worth testing whether adding back those points corrects
# any of the violations. Spoiler alert for the sake of our lab time: it doesn't.

# Let's consider why the assumptions were violated.
# Both income and percentage women are strongly skewed. Remember that this
# in and of itself may not violate the model assumptions... but it is
# possible to transform the skewed variables to be more normal, and this
# might influence the distribution of residuals and satisfy the assumptions.
varPlot(d$income)
varPlot(d$women)

# One place to start is by eye-balling the univariate distribution and noting
# the direction of the skew. Log transformations spread out the lower values
# and condense the higher values---in other words, they pull in a long right tail.
# If the data are skewed right (as income is), this can counteract the skew.
# Raising to a power between 0 and 1 (where raising to 1/2 is the square root,
# 1/3 is the cube root, etc.) will impose a similar transformation, but may be
# a little less intuitive.
# Powers greater than 1 will skew in the opposite direction.

# But before we start playing with these transformations on these data,
# let's take a moment to clearly see what transformations do.

############################################################
#### Quick visualization of transformations             ####
############################################################
# To give a quick sense of what transforming variables looks like, we can start by
# using R to quickly create some fake data.
x <- 1:100
y <- 1:100

# If we were to plot these two variables against each other,
# what would it look like?
plot(x, y)
# It's a straight line with a positive slope.

# Now let's see what some transformations of x do.
# We can try several different powers.

# -2
plot(x^-2, y)
# The relationship is now negative and strongly curved; most points are bunched
# near zero on the transformed axis, where the curve is steepest.

# -.5
plot(x^-.5, y)
# Still negative, but the curvature is less drastic.

# .2
plot(x^.2, y)
# Positive; the slope increases with increasing x.

# 1.5
plot(x^1.5, y)
# The slope now decreases with increasing x.

# 5
plot(x^5, y)
# The slope is now very steep for low values of x, but decreases
# sharply with increasing x.

# log
plot(log(x), y)   # log() is the natural log (ln x, log base e)
# The slope is now very low for low values of x, but increases
# sharply with increasing x.

# Transformations of y

# -2
plot(x, y^-2)
# The slope starts steeply negative, then flattens toward 0 with increasing x.

# -.5
plot(x, y^-.5)
# Still negative, but the drop is less drastic than with -2.

# .2
plot(x, y^.2)
# The positive slope decreases with increasing x.

# 1.5
plot(x, y^1.5)
# The slope starts positive, and increases with increasing x.

# 5
plot(x, y^5)
# The slope now increases even more rapidly with increasing x.

# log
plot(x, log(y))
# The slope starts high, but decreases with increasing x.

# Now that we have a better understanding of what transforming data can do, let's
# return to our dataset and use modelBoxCox to help us figure out what
# transformation to use.
varPlot(d$income)
varPlot(d$women)
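# Before letting modelBoxCox() choose a power formally (next step), here is an
# informal, hedged preview: eye-ball what a log2 transformation does to the
# observed income distribution using the same varPlot() tool. (This previews the
# distribution of the variable, not the residuals, so it is only a rough guide.)
varPlot(log2(d$income))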
# modelBoxCox() finds a power to raise your dependent variable to that is most likely
# to correct model assumption problems.
modelBoxCox(mod)

# These results recommend raising income to the .51 power. Not the easiest thing to
# interpret. But clearly we want to spread out the lower values and contract the high
# values, because that is what powers between 0 and 1 do. Logs do this too, so we might
# transform by "log base 2" using log2().
# A one-unit increase in log2(income) represents a doubling of income.

# We'll apply that transformation and refit the model.
mod1 <- lm(log2(income) ~ women + education, data = d)
modelSummary(mod1)

# How do we interpret the coefficients of this model?
# Every year of additional education leads to a 0.17 unit increase in log2(income).
# In other words, for roughly every 6 additional years of education (1/0.17 = 5.9),
# income is expected to double.

# So, did this fix our model assumptions?
modelAssumptions(mod1, "NORMAL")
modelAssumptions(mod1, "CONSTANT")
modelAssumptions(mod1, "LINEAR")
# ...a little bit, but we still have some problems.

# We could try transforming percent women as well.
mod2 <- lm(log2(income) ~ sqrt(women) + education, data = d)
modelSummary(mod2)
modelAssumptions(mod2, "NORMAL")
modelAssumptions(mod2, "CONSTANT")
modelAssumptions(mod2, "LINEAR")
# This transformation doesn't really improve the viability of our
# assumptions and complicates the interpretation of our regression coefficients.
# Probably not worthwhile.

# Remember that violating the assumptions of linearity, equal variance, and normality
# is not a death knell for a model. Normality is the least concerning. Violations
# inflate standard errors, and violating linearity may bias the parameter estimates,
# but all three can often be handled through transformations.

# Our next step might be to consider models of the data that make different assumptions
# about the distribution of residuals. It may be that the GLM is not the right tool for
# the job. That said, remember that the GLM is quite robust to violations of normality
# and constant variance. Linearity is a little more troublesome, but even so, small
# violations are not worth panicking and running to find a more exotic method. Take a
# measured approach and seek advice from those with relevant expertise if you feel your
# data violate the GLM assumptions in an irreparable way. We will explore some options
# next semester.

# But, let us be satisfied with the model with log2(income)
# ...after we conduct a case analysis on it! All the threats of influential data points
# still exist in models with transformed variables. We'll want to check if there are
# any points worth removing.
res2 <- modelCaseAnalysis(mod1, "RESIDUALS")
d[res2$Rownames, ]
inf2 <- modelCaseAnalysis(mod1, "INFLUENCEPLOT")
d[inf2$Rownames, ]

# What did we learn?
# Babysitters and newsboys are regression outliers. Their levels of education and, in
# the case of newsboys, the low percentage of women do not predict their low incomes
# very well.

# Remove these problem cases.
d = dfRemoveCases(d, rownames(d)[which(d$name == 'newsboys')])
d = dfRemoveCases(d, rownames(d)[which(d$name == 'babysitters')])

mod3 <- lm(log2(income) ~ women + education, data = d)
modelSummary(mod3)
modelSummary(mod1)
# Did our parameter estimates change when we removed these outliers? Standard errors?
# F statistics?
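# One quick, hedged way (base R rather than lmSupport) to put the two coefficient
# tables side by side when answering the questions above:
round(coef(summary(mod1)), 4)   # log2(income) model with babysitters and newsboys included
round(coef(summary(mod3)), 4)   # the same model after removing them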
##################
#### Graphing ####
##################
library(ggplot2)   # publication-quality graphing

# Create our set of values on our predictors for which we want predictions.
d$log2income <- log2(d$income)
Yhat <- seq(min(d$women), max(d$women), length = 100)
Yhat <- data.frame(women = Yhat, education = mean(d$education))
dIncome <- modelPredictions(mod1, Yhat)

scatterplot <- ggplot() +
  geom_point(data = d, aes(x = women, y = log2income), color = "black") +
  geom_smooth(aes(ymin = CILo, ymax = CIHi, x = women, y = Predicted),
              data = dIncome, stat = "identity", color = "red") +
  theme_bw(base_size = 14) +
  # x-axis label and tick mark locations. Use scale_x_continuous() because x is
  # specified as a continuous variable in the default aes settings.
  scale_x_continuous("Percent women", breaks = seq(0, 100, by = 10)) +
  # y-axis label and tick mark locations. Use scale_y_continuous() because y is
  # specified as a continuous variable in the default aes settings.
  scale_y_continuous("Log2 Income", breaks = seq(8, 15, by = 1)) +
  labs(title = 'Effect of % Women on Income')
scatterplot

#### WITH A CURVED REGRESSION LINE ####
# Back-transform the predictions (and the confidence limits!) from the log2 scale
# to the raw income scale.
dIncome$Predicted <- 2^dIncome$Predicted
dIncome$CILo <- 2^dIncome$CILo
dIncome$CIHi <- 2^dIncome$CIHi

scatterplot <- ggplot() +
  geom_point(data = d, aes(x = women, y = income), color = "black") +
  geom_smooth(aes(ymin = CILo, ymax = CIHi, x = women, y = Predicted),
              data = dIncome, stat = "identity", color = "red") +
  theme_bw(base_size = 14) +
  # x-axis label and tick mark locations.
  scale_x_continuous("Percent women", breaks = seq(0, 100, by = 10)) +
  # y-axis label and tick mark locations.
  scale_y_continuous("Income", breaks = seq(500, 26000, by = 1500)) +
  labs(title = 'Effect of % Women on Income')
scatterplot

varDescribe(d$income)
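# If you want to save the publication-quality figure to disk, ggsave() from ggplot2
# will do it (the file name here is just an example; adjust size/format as needed).
ggsave("Lab8_income_by_women.png", scatterplot, width = 6, height = 4, dpi = 300)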