#### THE2 KEY ####

# 1.
d = dfReadDat("THE2_Ex1.dat")
varDescribe(d)
d[sample(d$PetName, 6), ]
varPlot(d$Score)
# Score seems to be roughly normal across the possible values. The means of Species and
# Condition indicate even numbers of cats and dogs and of the two conditions.

# 2.
varDescribeBy(d$Score, list(d$Species, d$Condition))
# It would appear so, yes.

# 3.
m1.1 = lm(Score ~ Condition + Species, data=d)
modelSummary(m1.1)
confint(m1.1)
modelEffectSizes(m1.1)
# Cats perform better than dogs when controlling for condition, b = 5.03, [2.21, 7.86],
# t(104) = 3.54, p < .001, peta2 = .10. Pets in the training condition perform better
# than controls, controlling for species, b = 3.82, [.99, 6.64], t(104) = 2.68, p < .01,
# peta2 = .06.

# 4.
# a.
# No. None of the pets can be extreme on the predictor variables since the predictor
# variables are both dichotomous.

# b.
res = modelCaseAnalysis(m1.1, "RESIDUALS")
# No extreme outliers.

# c.
inf = modelCaseAnalysis(m1.1, "COOKSD")
d[inf$Rownames, ]
# I identified 3: Dexter, Mortimer, and Spock.
modelCaseAnalysis(m1.1, 'DFBETAS')
# Dexter appears to be unduly influencing the Species parameter.
inf
# He also has the highest Cook's D value.

# d.
# Dexter. He is far away from the rest of the data.
# Dexter is very bad at the task for a cat. Maybe he fell asleep during testing.
# You could also make an argument for removing Mortimer, Spock, and Spud. This was
# unintentionally a bit of a trick question, because Spock and Spud have equal Cook's D
# values. If one is removed, the other should be as well. Maybe better to not remove either.

# e.
dFull = d
d = dfRemoveCases(d, which(d$PetName=='Dexter'))

# 5.
d$ConditionC = d$Condition - .5
d$SpeciesC = d$Species - .5
# OR do this using varRecode, e.g., d$ConditionC = varRecode(d$Condition, c(0,1), c(-.5,.5))
m1.2 = lm(Score ~ SpeciesC * ConditionC, data=d)
modelSummary(m1.2)
confint(m1.2)
modelEffectSizes(m1.2)

# a.
# The main effect of species is significant when controlling for condition and their
# interaction, indicating cats are better than dogs at the task overall, b = 5.41,
# [2.76, 8.07], t(102) = 4.05, p < .001, peta2 = .138.

# b.
library(effects)
plot(effect('SpeciesC:ConditionC', m1.2))
# There is a significant interaction indicating that the difference in scores across
# training conditions is greater for dogs than for cats, meaning the training was more
# useful for them, b = -7.84, [-13.15, -2.54], t(102) = -2.93, p < .01, peta2 = .078.

# 6.
d$ConditionT = d$ConditionC - .5   # Training = 0, Control = -1, so SpeciesC now tests the simple effect among trained pets
varDescribe(d$ConditionT)
m1.3 = lm(Score ~ SpeciesC * ConditionT, data=d)
modelSummary(m1.3)
confint(m1.3)
modelEffectSizes(m1.3)
# There is not a significant effect of species in the training condition, t(102) = 0.79,
# p = .43. This means that dogs who are trained perform just as well on the task as cats
# who are trained.

# 7.
d$ConditionS = as.factor(varRecode(d$Condition, c(0,1), c('Control','Training')))
d$SpeciesS = as.factor(varRecode(d$Species, c(0,1), c('Dog','Cat')))
mPlot1 = lm(Score ~ ConditionS*SpeciesS, data=d)
X = expand.grid(ConditionS = c('Control','Training'), SpeciesS = c('Dog','Cat'))
Y = modelPredictions(mPlot1, X)
library(cowplot)
plot1 = ggplot(d, aes(x = SpeciesS, y = Score, fill = ConditionS)) +
  geom_bar(stat='identity', position=position_dodge(.9), aes(y=Predicted), data=Y) +
  geom_errorbar(aes(ymin = CILo, ymax = CIHi, y = Predicted), data=Y,
                position=position_dodge(.9), width=.5) +
  scale_fill_brewer(palette = 'Accent') +
  scale_y_continuous(limits = c(0,30), expand=c(0,0)) +
  labs(x = 'Species', fill = 'Condition')
plot1

# 8 in word doc.
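# Optional sensitivity check (a sketch, not part of the original key): refit the final
# interaction model on the full data saved in 4e (i.e., with Dexter included) to see how
# much the Species estimate shifts when the influential case is retained. mCheck is a
# hypothetical name introduced here.
dFull$ConditionC = dFull$Condition - .5
dFull$SpeciesC = dFull$Species - .5
mCheck = lm(Score ~ SpeciesC * ConditionC, data=dFull)
modelSummary(mCheck)   # compare the SpeciesC coefficient to m1.2's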
############### EXAMPLE 2 ###############

# 1.
d = dfReadDat('THE2_Ex2.dat')
varDescribe(d)
d[sample(nrow(d), 6), ]   # view 6 random cases
varPlot(d$Fat)
varPlot(d$Spice)
varPlot(d$Pumpkin)
varPlot(d$SCM)
varPlot(d$SgyBtm)
# The ingredient variables are roughly normally distributed, with relatively fat tails.
# This makes sense based on what the researcher is trying to do. There are many more pies
# without soggy bottoms than with.

# 2.
# a.
cor(d$Pumpkin, d$SCM)
plot(d$Pumpkin, d$SCM)
d$Pumpkin + d$SCM   # 32 for all
# The variables are perfectly inversely correlated. In essence, the grad student kept the
# total filling amount constant, so as one quantity increased, the other necessarily
# decreased accordingly.

# b.
d$Filling = d$Pumpkin - d$SCM
varPlot(d$Filling)
# Min is -8: a pie that has 8 ounces more SCM than pumpkin pie filling.
# Max is 8: a pie that has 8 ounces more pumpkin pie filling than SCM.

# c.
d$Pumpkin = NULL
d$SCM = NULL

# 3.
# a.
library(psych)
corr.test(d[, c('Scrummy','Beauty','Dodgy')])
alpha(d[, c('Dodgy','Beauty','Scrummy')], keys='Dodgy')   # check.keys=TRUE is also ok
# Alpha is .77, which is pretty good, and won't go up if any item is removed.

# b.
d$Rating = varScore(d, Forward = c('Scrummy','Beauty'), Reverse='Dodgy', Range=c(1,7))/3
varPlot(d$Rating)
# Somewhat negatively skewed, but not too bad.

# 4.
m2.1 = lm(Rating ~ Filling + Spice, data=d)
modelSummary(m2.1)

# a.
modelAssumptions(m2.1, 'NORMAL')    # looks decent, though a bit wonky
modelAssumptions(m2.1, 'CONSTANT')  # looks pretty nice, though not super linear in the right pane
modelAssumptions(m2.1, 'LINEAR')    # not super linear

# b.
# I probably wouldn't transform. If I did, I'd transform the Rating variable. It's less
# normally distributed than the others and, unlike them, doesn't actually have any meaning
# in raw terms. Both the others are based on real, meaningful units.

# c.
modelBoxCox(m2.1)   # lambda = 1.47
# Since lambda is greater than 1, this transformation would spread out larger scores and
# compress smaller scores.

# 5.
d$FatM = d$Fat - mean(d$Fat)
d$SgyBtmM = d$SgyBtm - .5   # varRecode also ok
m2.2 = lm(Rating ~ FatM*SgyBtmM, data=d)
modelSummary(m2.2)
modelEffectSizes(m2.2)
confint(m2.2)

# a.
plot(effect('FatM:SgyBtmM', m2.2))
# There was a positive effect of fat for pies neutral with respect to their bottom,
# b = 0.13, [.07, .20], t(124) = 4.02, p < .001, peta2 = .12.
# There was a significant interaction indicating that the effect of the amount of fat
# used was more positive for pies that did not have a soggy bottom, b = -0.14,
# [-0.27, -0.02], t(124) = -2.21, p < .05, peta2 = .04.

# b.
d$SgyBtmY = d$SgyBtmM - .5   # Soggy = 0, Crisp = -1
m2.3 = lm(Rating ~ FatM*SgyBtmY, data=d)
modelSummary(m2.3)
# There is no effect of fat for pies that have a soggy bottom, t(124) = 1.11, p = .27.
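# Optional sanity check (a sketch, assuming m2.2 and m2.3 above): the simple slope of fat
# for soggy-bottom pies can also be recovered directly from m2.2's coefficients, since the
# slope of FatM at a given SgyBtmM value is b1 + b3*SgyBtmM. It should match the FatM
# coefficient in m2.3.
b = coef(m2.2)
b['FatM'] + b['FatM:SgyBtmM'] * .5   # fat slope when SgyBtmM = .5 (soggy)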
# 6.
d$SpiceM = d$Spice - mean(d$Spice)
d$FillingM = d$Filling - mean(d$Filling)
m2.4 = lm(Rating ~ FillingM*SpiceM, data=d)   # named m2.4 to avoid overwriting m2.3 from 5b
modelSummary(m2.4)
modelEffectSizes(m2.4)
confint(m2.4)

# a.
plot(effect('FillingM:SpiceM', m2.4))
# We did not observe a positive effect of spice for pies with an equal amount of pumpkin
# pie filling and SCM, t(124) = 1.9, p = .06. However, we did observe the predicted
# interaction: spice had a more positive effect for pies with less pumpkin in them,
# b = -0.05, [-0.08, -0.02], t(124) = -2.94, p < .01, peta2 = .065.

# b.
# b1: For pies with an average amount of spice, each additional ounce of pumpkin pie
#     filling versus SCM led to a .004 point increase in Mary's rating, n.s.
# b2: For pies with an equal amount of pumpkin pie filling and SCM, each additional
#     teaspoon of pumpkin pie spice increased Mary's rating by .16 points, p = .06.
# b3: The relationship between spice amount and rating became .05 units weaker (more
#     negative) for each additional ounce of pumpkin pie filling versus SCM.

# 7.
# Hypothesis 1:
d$SgyBtmS = as.factor(varRecode(d$SgyBtm, c(0,1), c('Crisp','Soggy')))
mPlot21 = lm(Rating ~ Fat * SgyBtmS, data=d)
X = expand.grid(Fat = seq(min(d$Fat), max(d$Fat), length=128),
                SgyBtmS = c('Crisp','Soggy'))
Y = modelPredictions(mPlot21, X)
library(cowplot)
plot2 = ggplot(d, aes(x=Fat, y=Rating, color=SgyBtmS)) +
  geom_point() +
  scale_color_brewer(palette='Dark2') +
  geom_smooth(aes(ymin = CILo, ymax = CIHi, y = Predicted), stat='identity', data=Y) +
  labs(x='Amount of fat used (oz)', color = 'Crust quality')
plot2

# Hypothesis 2:
mPlot22 = lm(Rating ~ Filling*Spice, data=d)
XL = expand.grid(Filling = seq(min(d$Filling), max(d$Filling), length=128),
                 Spice = mean(d$Spice) - sd(d$Spice))
XH = expand.grid(Filling = seq(min(d$Filling), max(d$Filling), length=128),
                 Spice = mean(d$Spice) + sd(d$Spice))
YL = modelPredictions(mPlot22, XL)
YH = modelPredictions(mPlot22, XH)
plot3 = ggplot(d, aes(x = Filling, y = Rating, color=Spice)) +
  geom_point() +
  geom_smooth(aes(x = Filling, ymin = CILo, ymax = CIHi, y = Predicted), data=YL, stat='identity') +
  geom_smooth(aes(x = Filling, ymin = CILo, ymax = CIHi, y = Predicted), data=YH, stat='identity') +
  scale_x_continuous(breaks=c(-8, -6, -4, -2, 0, 2, 4, 6, 8)) +
  labs(x = 'Balance of canned pumpkin to SCM (oz)', color = 'Spice (tsp)')
plot3
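# Optional follow-up (a sketch, assuming m2.4 from 6): the simple slopes of spice at +/- 1 SD
# of the filling balance can be computed from the coefficients, mirroring the two bands in
# plot3. The slope of SpiceM at a given FillingM value is b2 + b3*FillingM.
b = coef(m2.4)
b['SpiceM'] + b['FillingM:SpiceM'] * -sd(d$Filling)   # spice slope at -1 SD filling (more SCM)
b['SpiceM'] + b['FillingM:SpiceM'] *  sd(d$Filling)   # spice slope at +1 SD filling (more pumpkin)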