# Homework Key for Week 2

# 1. Read in the data and inspect them (summaries, structure, descriptive stats).
library(lmSupport)
library(psych)
library(car)    # some() comes from the car package

d <- read.csv("stray_animals.csv")
varDescribe(d)
summary(d)
some(d)

# 2. Plot stray dogs in year 1 by number of animal shelters
plot(d$Shelters, d$Dogs1)

# a. Add a best-fitting line.
abline(lm(d$Dogs1 ~ d$Shelters))

# b. Does the relationship appear positive or negative? Strong or weak?
# Negative and relatively weak: more shelters are associated with fewer strays.

# 3. Create a stray index
# a. Standardize scores for the three types in year one.
d$Dogs1Z <- (d$Dogs1 - mean(d$Dogs1)) / sd(d$Dogs1)
d$Cats1Z <- (d$Cats1 - mean(d$Cats1)) / sd(d$Cats1)
d$OtherZ <- (d$Other - mean(d$Other)) / sd(d$Other)
# (An optional cross-check using scale() appears at the end of this script.)

# b. Check your work
varDescribe(d)
# All three z-scored variables have a mean of 0 and an SD of 1.

# c. Check the reliability. What should be dropped?
alpha(d[, c('Dogs1Z', 'Cats1Z', 'OtherZ')])
# alpha = 0.89
# We shouldn't drop any of them.

# d. Create Strays1
d$Strays1 <- rowMeans(d[, c('Dogs1Z', 'Cats1Z', 'OtherZ')], na.rm = TRUE)
varDescribe(d$Strays1)

# e. Why standardize first?
# It's a scaling issue. Because the numbers of stray dogs and cats are much larger
# than the number of "other" strays, those two variables would dominate a raw average
# even though we also care about the "other" strays. Standardizing puts all three
# variables on the same scale before averaging.
# (The problem is even more pronounced when variables are measured on different scales.)

# 4. Histograms of stray cats, with labels
hist(d$Cats1,
     xlab = "Number of cats (in thousands)",
     main = "Histogram of stray cat numbers at year one")
hist(d$Cats5,
     xlab = "Number of cats (in thousands)",
     main = "Histogram of stray cat numbers at year five")

# 5. Create catsChange variable
d$catsChange <- d$Cats5 - d$Cats1
varDescribe(d$catsChange)

# 6. Two models
# C (compact):   catsChange_i = b0 + e_i, with b0 fixed at 0
# A (augmented): catsChange_i = b0 + e_i, with b0 estimated from the data
#                (the estimate will be mean(catsChange))

# 7. Compute the SSE for each model using brute force.
# C: predict 0 for every case
d$Predict0 <- 0
d$Errors0  <- d$catsChange - d$Predict0
d$Squ_Err0 <- d$Errors0 * d$Errors0
SSE0 <- sum(d$Squ_Err0)
SSE0
# 89.69

# A: predict the mean for every case
d$Predict1 <- mean(d$catsChange)
d$Errors1  <- d$catsChange - d$Predict1
d$Squ_Err1 <- d$Errors1 * d$Errors1
SSE1 <- sum(d$Squ_Err1)
SSE1
# 34.14

# 8. Compare models with F and p.
n <- 50
Fstat <- ((SSE0 - SSE1) / (1 - 0)) / (SSE1 / (n - 1))
Fstat
# 79.71
dfN <- 1 - 0    # parameters in A minus parameters in C
dfD <- n - 1    # n minus parameters in A
pf(Fstat, dfN, dfD, lower.tail = FALSE)
# very small
# The mean change is significantly different from 0: the basic (mean-only) model fits
# better than the null model. The number of stray cats decreased significantly from
# year 1 to year 5.
# (An optional lm()/anova() cross-check appears at the end of this script.)

# 9. Create dogsChange.
d$dogsChange <- d$Dogs5 - d$Dogs1
varDescribe(d$dogsChange)

# 10. Use lm for the basic model.
mod1 <- lm(dogsChange ~ 1, data = d)
modelSummary(mod1)
# (An optional t.test() cross-check appears at the end of this script.)

# a. What does the b0 value represent?
# The average change in the number of stray dogs, in thousands.

# b. What does the significance test of b0 mean?
# That b0 is significantly different from 0: the probability of obtaining a sample mean
# this far from zero if the population mean were actually zero is extremely small.

# c. What are the two models being compared?
# The null model (predicting 0 for every state) and the basic model (predicting the
# mean for every state).

# d. What can we conclude based on the results of our test?
# The number of stray dogs increased significantly from year 1 to year 5.
# The increase was about 480 additional dogs per state, on average.
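
# ---------------------------------------------------------------------------
# Optional cross-check for #3a (not part of the assignment): a minimal sketch,
# assuming d already contains Dogs1, Cats1, and Other. Base R's scale() also
# subtracts the mean and divides by the sample SD, so it should reproduce the
# hand-computed z-scores above.
all.equal(as.numeric(scale(d$Dogs1)), d$Dogs1Z)   # TRUE if the hand computation matches
all.equal(as.numeric(scale(d$Cats1)), d$Cats1Z)
all.equal(as.numeric(scale(d$Other)), d$OtherZ)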
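
# Optional cross-check for #7-8 (not part of the assignment): a sketch of how the
# same SSEs and F can be recovered from lm(). Model C is a zero-parameter fit
# (catsChange ~ 0) and Model A is the intercept-only fit (catsChange ~ 1);
# anova() on the two nested fits should reproduce the hand-computed F of about
# 79.7 on 1 and 49 df. The object names modC and modA are just illustrative.
modC <- lm(catsChange ~ 0, data = d)   # predicts 0 for every case
modA <- lm(catsChange ~ 1, data = d)   # predicts the mean for every case
sum(residuals(modC)^2)                 # should equal SSE0 (89.69)
sum(residuals(modA)^2)                 # should equal SSE1 (34.14)
anova(modC, modA)                      # F and p for the model comparison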
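
# Optional cross-check for #10 (not part of the assignment): testing the
# intercept of the basic model is equivalent to a one-sample t-test of
# dogsChange against 0, so t.test() should give the same t, df, and p as
# modelSummary(mod1), with t^2 equal to the corresponding F.
t.test(d$dogsChange, mu = 0)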