## Thursday, May 28, 2015

### Savannah Sparrows are so easy to catch

 They also tend to have a lot of dirt on their beaks.  Kinda messy critters.  Savannah Sparrow, near Brooks, Newell County, Alberta.  18 May 2015.

## Tuesday, May 26, 2015

### Chestnut-collared Longspur in the hand

 Male Chestnut-collared Longspur near Brooks, Newell County, Alberta.  19 May 2015.

## Friday, May 22, 2015

### Field work in full swing!

Field work is in full swing.  I'm back out in Brooks, Alberta, and we're banding Baird's Sparrows, Savannah Sparrows, and Chestnut-collared Longspurs.  Here's one of our beautiful Baird's from yesterday!
 Baird's Sparrow, near Brooks, Newell County, Alberta.  21 May 2015.

## Tuesday, May 05, 2015

### Beginning R: a very brief survey of methods for non-normal data

This was the last lab code for the quantitative methods class I co-taught with Dr. Nicola Koper this winter semester (i.e., the semester I used to know as "spring").

#Start by setting your workspace.  Even though we don't import a file this time,
#it's good practice so any figures you generate will go to the correct folder.
#Otherwise they tend to disappear into that totally obvious folder that you then forget.
######################################################################################
#Session>Set Working Directory>To Source File Location [or you can choose the directory]

#Today we'll use a sample dataset that's already in R,
#plus generate some random data of our own.
#No need for text files or importing!

#Generate some proportions
#No proportion data, so we'll just add it.
#I generate more data than we need, then cut off <0 and >1.
inventedproportions.prelim<-rnorm(mean=0.5,
sd=0.5,
n=300)
inventedproportions.keep.logical<-inventedproportions.prelim<=1&inventedproportions.prelim>=0
inventedproportions<-inventedproportions.prelim[inventedproportions.keep.logical]
airquality\$inventedproportions<-inventedproportions[1:153]

library(gamlss.dist)

airquality\$somanyzeros<-rZIP(n=153,
mu=5,
sigma=0.3)

str(airquality)
airquality<-na.omit(airquality)
str(airquality)

###########################################
#Brief review of graphical data exploration
###########################################

par(mfrow=c(2,3))
#Histograms for normality.
hist(airquality\$Ozone)
hist(airquality\$Solar.R)
hist(airquality\$Wind)
hist(airquality\$Temp)
hist(airquality\$inventedproportions)
hist(airquality\$somanyzeros)

#Residuals are what count for our models.
model.Solar.R<-lm(Ozone~Solar.R,
data=airquality)
par(mfrow=c(2,2))
plot(model.Solar.R)
resid.model.Solar.R<-resid(model.Solar.R)
par(mfrow=c(1,1))
hist(resid.model.Solar.R)

model.somanyzeros<-lm(Ozone~somanyzeros,
data=airquality)
par(mfrow=c(2,2))
plot(model.somanyzeros)
resid.model.somanyzeros<-resid(model.somanyzeros)
hist(resid.model.somanyzeros)

###########################################
###########################################

airquality\$ln.Solar.R<-log(airquality\$Solar.R)
#add a constant if there are zeros.
#pick a constant that transforms your variable best,
#see pp. 65-66 in Quinn and Keough for explanation.
airquality\$ln.Solar.R.min<-log(airquality\$Solar.R+min(airquality\$Solar.R, na.rm=TRUE))
airquality\$ln.Solar.R.001<-log(airquality\$Solar.R+0.001)
airquality\$ln.Solar.R.1<-log(airquality\$Solar.R+1)
#Some other transformations.
airquality\$Solar.R.inverted<-1/(airquality\$Solar.R)
airquality\$sqrt.Solar.R<-sqrt(airquality\$Solar.R) #square root
airquality\$cube.root.Solar.R<-airquality\$Solar.R^(1/3) #cube root
airquality\$fourth.root.Solar.R<-airquality\$Solar.R^(1/4) #fourth root

#Box-Cox transformation.  R only lets you actually run this on a model.
#It's for transforming your response variable ONLY.
library(MASS)
par(mfrow=c(1,2))
Solar.R.boxcox<-boxcox(model.Solar.R)

#Use the graph to zoom in around the mostly likely lambda value.
Solar.R.boxcox.zoom<-boxcox(model.Solar.R,
lambda=seq(-0.05,0.35,0.1))
#From the plot, lambda should be about 0.15 (1/6 or 1/7, so a 6th or 7th root transformation.)
airquality\$sixth.root.Ozone<-airquality\$Ozone^(1/6)

airquality\$arcsinsqrt.inventedproportions<-asin(sqrt(airquality\$inventedproportions)) #arcsin square root transformation for proportions

#reflecting your data before transforming (pg. 66 in Quinn and Keough)
reflection.constant<-(max(airquality\$somanyzeros)+1)
airquality\$lnreflected.somanyzeros<-log((reflection.constant-airquality\$somanyzeros)+0.001)
#Re-evaluating your data after transformation: run the exploratory analyses again.

par(mfrow=c(1,2))
hist(airquality\$Ozone)
hist(airquality\$sixth.root.Ozone)

par(mfrow=c(2,4))
hist(airquality\$Solar.R)
hist(airquality\$ln.Solar.R.min)
hist(airquality\$ln.Solar.R.001)
hist(airquality\$ln.Solar.R.1)
#Some other transformations.
hist(airquality\$Solar.R.inverted)
hist(airquality\$sqrt.Solar.R)
hist(airquality\$cube.root.Solar.R)
hist(airquality\$fourth.root.Solar.R)

par(mfrow=c(1,2))
hist(airquality\$inventedproportions)
hist(airquality\$arcsinsqrt.inventedproportions)

hist(airquality\$somanyzeros)
hist(airquality\$lnreflected.somanyzeros)

#Residuals are what count for our models.
model.sqrt.Solar.R<-lm(Ozone~sqrt.Solar.R,
data=airquality)
par(mfrow=c(2,2))
plot(model.sqrt.Solar.R)
resid.model.sqrt.Solar.R<-resid(model.sqrt.Solar.R)
par(mfrow=c(1,2))
hist(resid.model.sqrt.Solar.R)
#previous
hist(resid.model.Solar.R)

model.somanyzeros<-lm(Ozone~somanyzeros,
data=airquality)
par(mfrow=c(2,2))
plot(model.somanyzeros)
resid.model.somanyzeros<-resid(model.somanyzeros)

model.lnreflected.somanyzeros<-lm(Ozone~lnreflected.somanyzeros,
data=airquality)
par(mfrow=c(2,2))
plot(model.lnreflected.somanyzeros)
resid.model.lnreflected.somanyzeros<-resid(model.lnreflected.somanyzeros)

#compare
hist(resid.model.lnreflected.somanyzeros)
hist(resid.model.somanyzeros)
#The original is actually better... so remember to look at residuals!

###########################################
#Heteroscedasticity: inequality of variance
###########################################
#code from last lab, but with the new sample data.
pairs(~Ozone+Solar.R+Wind+Temp,
data=airquality)

#density plots with our scatterplots
library(car)
scatterplotMatrix(~Ozone+Solar.R+Wind+Temp, #weird formula here, nothing as "response"/Y, and |indicates grouping variable.
data=airquality,
smoother=FALSE, #default is TRUE, draws a nonparametric smoother.
reg.line=FALSE, #default is TRUE, draws a regression line.
diagonal="density", #puts a density kernel line on the diagonal for each group.
# ellipse=TRUE,
# levels=c(0.5,0.95), #dataEllipse quartiles
pch=c(2,3,1), #rearrange the default points a bit
col=c(1,1,1)) #changed the colors to all black.

#tests for homogeneity of variance

#These only work on groups, so group your Solar.R first.
airquality\$Solar.R.cat<-NA
high<-airquality\$Solar.R<=max(airquality\$Solar.R, na.rm=TRUE)&airquality\$Solar.R>=median(airquality\$Solar.R, na.rm=TRUE)
airquality[high,"Solar.R.cat"]="high"

low<-airquality\$Solar.R>=min(airquality\$Solar.R, na.rm=TRUE)&airquality\$Solar.R<median(airquality\$Solar.R, na.rm=TRUE)
airquality[low,"Solar.R.cat"]="low"
airquality\$Solar.R.cat<-as.factor(airquality\$Solar.R.cat)

bartlett.test(Ozone~Solar.R.cat, #variable as a function of groups
data=airquality )
?bartlett.test #gives options for other tests.

library(car)
leveneTest(Ozone~Solar.R.cat,
data=airquality)
#Less sensitive than Bartlett's to non-normality.
###########################################
#Variations on regression
###########################################
data=airquality)

#cubic model
cubic.model<-lm(Ozone~Temp+I(Temp^2)+I(Temp^3),
data=airquality)
summary(cubic.model)

par(mfrow=c(1,1))
plot(airquality\$Ozone~airquality\$Temp)
ndQ<- data.frame("Temp"=seq(min(airquality\$Temp),
max(airquality\$Temp),
length.out=length(airquality\$Temp)))

lines(ndQ\$Temp,
newdata=ndQ),
lty="solid",
col="black")

lines(ndQ\$Temp,
predict(cubic.model,
newdata=ndQ),
lty="solid",
col="red")

#weighted OLS regression

#Group your data, or use existing groups.  (Here I just use the two groups from before for the airquality data.)

(var.solar<-tapply(airquality\$Ozone, #the variable we're interested in
airquality\$Solar.R.cat, #index (i.e., the grouping variable)
FUN=var, na.rm=TRUE))
solar.wts<-1/var.solar
airquality\$wts<-NA
airquality[airquality\$Solar.R.cat=="high","wts"]=solar.wts[1]
airquality[airquality\$Solar.R.cat=="low","wts"]=solar.wts[2]

wt.lm<-lm(Ozone~Solar.R,
weights=wts,
data=airquality)
summary(wt.lm)
par(mfrow=c(2,2))
plot(wt.lm)
#compare with original
plot(model.Solar.R)
summary(model.Solar.R)

#Or run as ANOVA.
wt.anova<-lm(Ozone~Solar.R.cat,
weights=wts,
data=airquality)
summary(wt.anova)
plot(wt.anova)
#compare with unweighted.
solar.anova<-lm(Ozone~Solar.R.cat,
data=airquality)
summary(solar.anova)
plot(solar.anova)

#For more details on how to do this (including normalizing your weights, which is apparently needed
#if you plan to get confidence intervals), check the top answer here.

###########################################
#Generalized linear models
###########################################
#functions, how to select family and link
glm.wolf<-glm(number~latitude,
data=Depredations,
family=poisson)
summary(glm.wolf)
plot(glm.wolf)

par(mfrow=c(1,1))
plot(Depredations\$number~Depredations\$latitude)
ndW<- data.frame("latitude"=seq(min(Depredations\$latitude),
max(Depredations\$latitude),
length.out=length(Depredations\$latitude)))
#plot the prediction with the new data (otherwise it uses rownumber and stretches the line out uselessly).
lines(ndW\$latitude,
predict.glm(glm.wolf,
newdata=ndW,
type="response"),
lty="solid",lwd=2)

#GLM with a factor.
glm.snails<-glm(Deaths~Rel.Hum*Species,
data=snails,
#select the link manually, although still leaving it at default.
summary(glm.snails)

#quasipoisson suggested because very high deviance to df ratio.
glm.snails.q<-glm(Deaths~Rel.Hum*Species,
data=snails,
family=quasipoisson)
summary(glm.snails.q)

#plot the two lines.
plot(snails\$Deaths[snails\$Species=="A"]~jitter(snails\$Rel.Hum[snails\$Species=="A"]),
pch="A",
cex=0.5,
xlim=c(60,80),
ylim=c(0,20))
points(snails\$Deaths[snails\$Species=="B"]~jitter(snails\$Rel.Hum[snails\$Species=="B"]),
pch="B",
cex=0.5)

ndA<- data.frame("Rel.Hum"=seq(min(snails\$Rel.Hum),
max(snails\$Rel.Hum),
length.out=length(snails\$Rel.Hum)),
"Species"="A")
lines(ndA\$Rel.Hum,
predict.glm(glm.snails.q,
newdata=ndA,
type="response"), #type="response" is necessary for these interesting links.
lty="solid")

ndB<- data.frame("Rel.Hum"=seq(min(snails\$Rel.Hum),
max(snails\$Rel.Hum),
length.out=length(snails\$Rel.Hum)),
"Species"="B")
lines(ndB\$Rel.Hum,
exp(predict.glm(glm.snails.q,
newdata=ndB)),
lty="dotted")
#Just to show that predict.glm is going the same thing as exp(prediction):
lines(ndB\$Rel.Hum,
predict.glm(glm.snails.q,
newdata=ndB,
type="response"),
lty="solid",
col="blue")

(predictions.compared<-data.frame(snails\$Species,
snails\$Deaths,
fitted(glm.snails.q),
predict(glm.snails.q, type="response"),
predict(glm.snails.q))) #note that without type="response" you get the linear predictors only.

#Run as an ANCOVA (analysis of covariance)
Anova(glm.snails.q, type=3)

#evaluation of distribution fit using deviance (see lecture).
#Look at summary(yourmodel)
summary(glm.snails.q)
#Ratio of residual deviance to degrees of freedom as described in lecture.
#You can also look for packages that expand on this topic.

###########################################
#Nonparametric tests
###########################################
#Mann-Whitney/Wilcoxon tests.
wilcox.test(Ozone~Solar.R.cat,
data=airquality)
#pairwise.wilcox.test() if you have paired measurements (i.e. repeated measurements), also known as the Signed-Ranks test.

#For two or MORE samples, use the Kruskal-Wallis rank sum test.
kruskal.test(Ozone~Month,
data=airquality) #for two or more samples.
boxplot(Ozone~Month,
data=airquality,
xlab="Month",
ylab="Ozone in some units")
#Opinions on post-hoc tests vary, but a few packages are available and easily findable via google.