# Homework Key for Week 2

# 1. Read in the data and inspect them (summaries, structure, descriptive stats).
library(lmSupport)
library(psych)
library(car)    # some() comes from the car package

d <- read.csv("stray_animals.csv")
varDescribe(d)
summary(d)
some(d)

# 2. Plot stray dogs in year 1 by number of animal shelters
plot(d$Shelters, d$Dogs1)

# a. Add a best-fitting line.
abline(lm(d$Dogs1 ~ d$Shelters))

# b. Does the relationship appear positive or negative? Strong or weak?
# Negative and relatively weak: more shelters are associated with fewer strays.

# 3. Create a stray index
# a. Standardize scores for the three types in year one.
d$Dogs1Z <- (d$Dogs1 - mean(d$Dogs1)) / sd(d$Dogs1)
d$Cats1Z <- (d$Cats1 - mean(d$Cats1)) / sd(d$Cats1)
d$OtherZ <- (d$Other - mean(d$Other)) / sd(d$Other)
# (An optional cross-check using scale() appears at the end of this script.)

# b. Check your work
varDescribe(d)
# All three z-scored variables have a mean of 0 and an SD of 1.

# c. Check the reliability. What should be dropped?
alpha(d[, c('Dogs1Z', 'Cats1Z', 'OtherZ')])
# alpha = 0.89
# We shouldn't drop any of them.

# d. Create Strays1
d$Strays1 <- rowMeans(d[, c('Dogs1Z', 'Cats1Z', 'OtherZ')], na.rm = TRUE)
varDescribe(d$Strays1)

# e. Why standardize first?
# It's a scaling issue. Because the numbers of stray dogs and cats are much larger
# than the number of "other" strays, those two variables would dominate a raw average
# even though we also care about the "other" strays. Standardizing puts all three
# variables on the same scale before averaging.
# (The problem is even more pronounced when variables are measured on different scales.)

# 4. Histograms of stray cats, with labels
hist(d$Cats1,
     xlab = "Number of cats (in thousands)",
     main = "Histogram of stray cat numbers at year one")
hist(d$Cats5,
     xlab = "Number of cats (in thousands)",
     main = "Histogram of stray cat numbers at year five")

# 5. Create catsChange variable
d$catsChange <- d$Cats5 - d$Cats1
varDescribe(d$catsChange)

# 6. Two models
# C (compact):   catsChange_i = b0 + e_i, with b0 fixed at 0
# A (augmented): catsChange_i = b0 + e_i, with b0 estimated from the data
#                (the estimate will be mean(catsChange))

# 7. Compute the SSE for each model using brute force.
# C: predict 0 for every case
d$Predict0 <- 0
d$Errors0  <- d$catsChange - d$Predict0
d$Squ_Err0 <- d$Errors0 * d$Errors0
SSE0 <- sum(d$Squ_Err0)
SSE0
# 89.69

# A: predict the mean for every case
d$Predict1 <- mean(d$catsChange)
d$Errors1  <- d$catsChange - d$Predict1
d$Squ_Err1 <- d$Errors1 * d$Errors1
SSE1 <- sum(d$Squ_Err1)
SSE1
# 34.14

# 8. Compare models with F and p.
n <- 50
Fstat <- ((SSE0 - SSE1) / (1 - 0)) / (SSE1 / (n - 1))
Fstat
# 79.71
dfN <- 1 - 0    # parameters in A minus parameters in C
dfD <- n - 1    # n minus parameters in A
pf(Fstat, dfN, dfD, lower.tail = FALSE)
# very small
# The mean change is significantly different from 0: the basic (mean-only) model fits
# better than the null model. The number of stray cats decreased significantly from
# year 1 to year 5.
# (An optional lm()/anova() cross-check appears at the end of this script.)

# 9. Create dogsChange.
d$dogsChange <- d$Dogs5 - d$Dogs1
varDescribe(d$dogsChange)

# 10. Use lm for the basic model.
mod1 <- lm(dogsChange ~ 1, data = d)
modelSummary(mod1)
# (An optional t.test() cross-check appears at the end of this script.)

# a. What does the b0 value represent?
# The average change in the number of stray dogs, in thousands.

# b. What does the significance test of b0 mean?
# That b0 is significantly different from 0: the probability of obtaining a sample mean
# this far from zero if the population mean were actually zero is extremely small.

# c. What are the two models being compared?
# The null model (predicting 0 for every state) and the basic model (predicting the
# mean for every state).

# d. What can we conclude based on the results of our test?
# The number of stray dogs increased significantly from year 1 to year 5.
# The increase was about 480 additional dogs per state, on average.
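
# ---------------------------------------------------------------------------
# Optional cross-check for #3a (not part of the assignment): a minimal sketch,
# assuming d already contains Dogs1, Cats1, and Other. Base R's scale() also
# subtracts the mean and divides by the sample SD, so it should reproduce the
# hand-computed z-scores above.
all.equal(as.numeric(scale(d$Dogs1)), d$Dogs1Z)   # TRUE if the hand computation matches
all.equal(as.numeric(scale(d$Cats1)), d$Cats1Z)
all.equal(as.numeric(scale(d$Other)), d$OtherZ)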
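
# Optional cross-check for #7-8 (not part of the assignment): a sketch of how the
# same SSEs and F can be recovered from lm(). Model C is a zero-parameter fit
# (catsChange ~ 0) and Model A is the intercept-only fit (catsChange ~ 1);
# anova() on the two nested fits should reproduce the hand-computed F of about
# 79.7 on 1 and 49 df. The object names modC and modA are just illustrative.
modC <- lm(catsChange ~ 0, data = d)   # predicts 0 for every case
modA <- lm(catsChange ~ 1, data = d)   # predicts the mean for every case
sum(residuals(modC)^2)                 # should equal SSE0 (89.69)
sum(residuals(modA)^2)                 # should equal SSE1 (34.14)
anova(modC, modA)                      # F and p for the model comparison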
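
# Optional cross-check for #10 (not part of the assignment): testing the
# intercept of the basic model is equivalent to a one-sample t-test of
# dogsChange against 0, so t.test() should give the same t, df, and p as
# modelSummary(mod1), with t^2 equal to the corresponding F.
t.test(d$dogsChange, mu = 0)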