Thursday, May 28, 2015

Savannah Sparrows are so easy to catch

They also tend to have a lot of dirt on their beaks.  Kinda messy critters.  Savannah Sparrow, near Brooks, Newell County, Alberta.  18 May 2015.

Friday, May 22, 2015

Field work in full swing!

Field work is in full swing.  I'm back out in Brooks, Alberta, and we're banding Baird's Sparrows, Savannah Sparrows, and Chestnut-collared Longspurs.  Here's one of our beautiful Baird's from yesterday!
Baird's Sparrow, near Brooks, Newell County, Alberta.  21 May 2015.

Tuesday, May 05, 2015

Beginning R: a very brief survey of methods for non-normal data

This was the last lab code for the quantitative methods class I co-taught with Dr. Nicola Koper this winter semester (i.e., the semester I used to know as "spring").

#Start by setting your working directory.  Even though we don't import a file this time,
#it's good practice so any figures you save will go to the correct folder.
#Otherwise they tend to disappear into some "totally obvious" folder that you then forget.
######################################################################################
setwd("C:\\Users\\YourUsername\\")
#Your directory here.
#Or use the menu...
#Session>Set Working Directory>To Source File Location [or you can choose the directory]

#Today we'll use a sample dataset that's already in R,
#plus generate some random data of our own.
#No need for text files or importing!

#Generate some proportion data.
#The airquality data set has no proportions, so we'll just add a made-up column.
#I generate more values than we need, then cut off anything <0 or >1.
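#If you want the random draws below to be reproducible from run to run, set a seed first.
#(The particular number is arbitrary; 42 is just an example.)
set.seed(42)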
inventedproportions.prelim<-rnorm(mean=0.5,
                                  sd=0.5,
                                  n=300)
inventedproportions.keep.logical<-inventedproportions.prelim<=1&inventedproportions.prelim>=0
inventedproportions<-inventedproportions.prelim[inventedproportions.keep.logical]
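#Optional sanity check: we need at least 153 surviving values to fill the 153 airquality rows.
#With mean 0.5 and sd 0.5, roughly two-thirds of the 300 draws should land in [0,1], so this rarely fails.
stopifnot(length(inventedproportions)>=153)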
airquality$inventedproportions<-inventedproportions[1:153]

#Generate zero-inflated data (install this package first if necessary; I had never used it before).
library(gamlss.dist)

airquality$somanyzeros<-rZIP(n=153,
                             mu=5,
                             sigma=0.3)


str(airquality)
airquality<-na.omit(airquality)
str(airquality)
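
#Quick look at the zero-inflated variable: how many exact zeros did we get?
#With sigma=0.3, roughly 30% of the values should be structural zeros (plus a few Poisson zeros).
table(airquality$somanyzeros==0)
mean(airquality$somanyzeros==0)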

###########################################
#Brief review of graphical data exploration
###########################################

par(mfrow=c(2,3))
#Histograms for normality.
hist(airquality$Ozone)
hist(airquality$Solar.R)
hist(airquality$Wind)
hist(airquality$Temp)
hist(airquality$inventedproportions)
hist(airquality$somanyzeros)

#Residuals are what count for our models.
model.Solar.R<-lm(Ozone~Solar.R,
          data=airquality)
par(mfrow=c(2,2))
plot(model.Solar.R)
resid.model.Solar.R<-resid(model.Solar.R)
par(mfrow=c(1,1))
hist(resid.model.Solar.R)
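#If you want a formal test to go with the histogram, a Shapiro-Wilk test on the residuals is one option.
#(Take it with a grain of salt: with big samples, even trivial departures from normality come out "significant".)
shapiro.test(resid.model.Solar.R)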

model.somanyzeros<-lm(Ozone~somanyzeros,
                  data=airquality)
par(mfrow=c(2,2))
plot(model.somanyzeros)
resid.model.somanyzeros<-resid(model.somanyzeros)
hist(resid.model.somanyzeros)


###########################################
#Transforming your data
###########################################

airquality$ln.Solar.R<-log(airquality$Solar.R)
#add a constant if there are zeros. 
#pick a constant that transforms your variable best,
#see pp. 65-66 in Quinn and Keough for explanation.
airquality$ln.Solar.R.min<-log(airquality$Solar.R+min(airquality$Solar.R, na.rm=TRUE))
airquality$ln.Solar.R.001<-log(airquality$Solar.R+0.001)
airquality$ln.Solar.R.1<-log(airquality$Solar.R+1)
#Some other transformations.
airquality$Solar.R.inverted<-1/(airquality$Solar.R)
airquality$sqrt.Solar.R<-sqrt(airquality$Solar.R) #square root
airquality$cube.root.Solar.R<-airquality$Solar.R^(1/3) #cube root
airquality$fourth.root.Solar.R<-airquality$Solar.R^(1/4) #fourth root


#Box-Cox transformation.  In R, boxcox() (from the MASS package) runs on a fitted model rather than on a bare variable.
#It's for transforming your response variable ONLY.
library(MASS)
par(mfrow=c(1,2))
Solar.R.boxcox<-boxcox(model.Solar.R)

#Use the graph to zoom in around the most likely lambda value.
Solar.R.boxcox.zoom<-boxcox(model.Solar.R,
                            lambda=seq(-0.05,0.35,0.1))
#From the plot, lambda should be about 0.15 (roughly 1/6 or 1/7, so a 6th- or 7th-root transformation).
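#You can also pull the exact lambda with the highest log-likelihood out of the boxcox object,
#instead of eyeballing it (boxcox() stores the lambda values in $x and the log-likelihoods in $y):
(best.lambda<-Solar.R.boxcox.zoom$x[which.max(Solar.R.boxcox.zoom$y)])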
airquality$sixth.root.Ozone<-airquality$Ozone^(1/6)

airquality$arcsinsqrt.inventedproportions<-asin(sqrt(airquality$inventedproportions)) #arcsin square root transformation for proportions

#reflecting your data before transforming (pg. 66 in Quinn and Keough)
reflection.constant<-(max(airquality$somanyzeros)+1)
airquality$lnreflected.somanyzeros<-log((reflection.constant-airquality$somanyzeros)+0.001)
#Re-evaluating your data after transformation: run the exploratory analyses again.

par(mfrow=c(1,2))
hist(airquality$Ozone)
hist(airquality$sixth.root.Ozone)

par(mfrow=c(2,4))
hist(airquality$Solar.R)
hist(airquality$ln.Solar.R.min)
hist(airquality$ln.Solar.R.001)
hist(airquality$ln.Solar.R.1)
#Some other transformations.
hist(airquality$Solar.R.inverted)
hist(airquality$sqrt.Solar.R)
hist(airquality$cube.root.Solar.R)
hist(airquality$fourth.root.Solar.R)

par(mfrow=c(1,2))
hist(airquality$inventedproportions)
hist(airquality$arcsinsqrt.inventedproportions)

hist(airquality$somanyzeros)
hist(airquality$lnreflected.somanyzeros)

#Residuals are what count for our models.
model.sqrt.Solar.R<-lm(Ozone~sqrt.Solar.R,
                  data=airquality)
par(mfrow=c(2,2))
plot(model.sqrt.Solar.R)
resid.model.sqrt.Solar.R<-resid(model.sqrt.Solar.R)
par(mfrow=c(1,2))
hist(resid.model.sqrt.Solar.R)
#previous
hist(resid.model.Solar.R)

model.somanyzeros<-lm(Ozone~somanyzeros,
                      data=airquality)
par(mfrow=c(2,2))
plot(model.somanyzeros)
resid.model.somanyzeros<-resid(model.somanyzeros)


model.lnreflected.somanyzeros<-lm(Ozone~lnreflected.somanyzeros,
                      data=airquality)
par(mfrow=c(2,2))
plot(model.lnreflected.somanyzeros)
resid.model.lnreflected.somanyzeros<-resid(model.lnreflected.somanyzeros)

#compare
hist(resid.model.lnreflected.somanyzeros)
hist(resid.model.somanyzeros)
#The original is actually better... so remember to look at residuals!

###########################################
#Heteroscedasticity: inequality of variance
###########################################
#code from last lab, but with the new sample data.
pairs(~Ozone+Solar.R+Wind+Temp,
      data=airquality)

#density plots with our scatterplots
library(car)
scatterplotMatrix(~Ozone+Solar.R+Wind+Temp, #one-sided formula: nothing on the left as the "response"/Y; a | would indicate a grouping variable.
                  data=airquality,
                  smoother=FALSE, #default is TRUE, draws a nonparametric smoother.
                  reg.line=FALSE, #default is TRUE, draws a regression line.
                  diagonal="density", #puts a density kernel line on the diagonal for each group.
                  # ellipse=TRUE,
                  # levels=c(0.5,0.95), #dataEllipse quartiles
                  pch=c(2,3,1), #rearrange the default points a bit
                  col=c(1,1,1)) #changed the colors to all black.

#tests for homogeneity of variance
#adapted from: http://www.cookbook-r.com/Statistical_analysis/Homogeneity_of_variance/

#These only work on groups, so group your Solar.R first.
airquality$Solar.R.cat<-NA
high<-airquality$Solar.R<=max(airquality$Solar.R, na.rm=TRUE)&airquality$Solar.R>=median(airquality$Solar.R, na.rm=TRUE)
airquality[high,"Solar.R.cat"]="high"

low<-airquality$Solar.R>=min(airquality$Solar.R, na.rm=TRUE)&airquality$Solar.R<median(airquality$Solar.R, na.rm=TRUE)
airquality[low,"Solar.R.cat"]="low"
airquality$Solar.R.cat<-as.factor(airquality$Solar.R.cat)
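
#An equivalent shortcut for making the two groups is cut(); shown only as an alternative,
#and the rest of the lab keeps using the Solar.R.cat column made above.
#right=FALSE puts the median itself in the "high" group, matching the >= used above.
airquality$Solar.R.cat2<-cut(airquality$Solar.R,
                             breaks=c(-Inf,median(airquality$Solar.R,na.rm=TRUE),Inf),
                             labels=c("low","high"),
                             right=FALSE)
table(airquality$Solar.R.cat,airquality$Solar.R.cat2)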

bartlett.test(Ozone~Solar.R.cat, #variable as a function of groups
              data=airquality )
?bartlett.test #gives options for other tests.

library(car)
leveneTest(Ozone~Solar.R.cat,
           data=airquality)
#Less sensitive than Bartlett's to non-normality.
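#One of the "other tests" listed under ?bartlett.test is the Fligner-Killeen test,
#which is also considered robust to departures from normality:
fligner.test(Ozone~Solar.R.cat,
             data=airquality)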
###########################################
#Variations on regression
###########################################
#nonlinear relationships (adding a polynomial term), adapted from here.
#The I() function is necessary for R to read your quadratic or cubic terms correctly.
quadratic.model<-lm(Ozone~Temp+I(Temp^2),
                    data=airquality)
summary(quadratic.model)

#cubic model
cubic.model<-lm(Ozone~Temp+I(Temp^2)+I(Temp^3),
                    data=airquality)
summary(cubic.model)

par(mfrow=c(1,1))
plot(airquality$Ozone~airquality$Temp)
ndQ<- data.frame("Temp"=seq(min(airquality$Temp),
                            max(airquality$Temp),
                            length.out=length(airquality$Temp)))

lines(ndQ$Temp,
      predict(quadratic.model,
              newdata=ndQ),
      lty="solid",
      col="black") 

lines(ndQ$Temp,
      predict(cubic.model,
              newdata=ndQ),
      lty="solid",
      col="red") 

anova(quadratic.model,cubic.model)
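
#A sketch of an alternative way to specify the same polynomial: poly() builds the terms for you.
#By default it uses orthogonal polynomials, so the individual coefficients differ from the I() version,
#but the fitted values (and the model comparison) are the same.
quadratic.model.poly<-lm(Ozone~poly(Temp,2),
                         data=airquality)
summary(quadratic.model.poly)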

#weighted OLS regression
#Additional info.

#Group your data, or use existing groups.  (Here I just use the two groups from before for the airquality data.)

(var.solar<-tapply(airquality$Ozone, #the variable we're interested in
                      airquality$Solar.R.cat, #index (i.e., the grouping variable)
                      FUN=var, na.rm=TRUE))
solar.wts<-1/var.solar
airquality$wts<-NA
airquality[airquality$Solar.R.cat=="high","wts"]=solar.wts[1]
airquality[airquality$Solar.R.cat=="low","wts"]=solar.wts[2]

wt.lm<-lm(Ozone~Solar.R,
          weights=wts,
          data=airquality)
summary(wt.lm)
par(mfrow=c(2,2))
plot(wt.lm)
#compare with original
plot(model.Solar.R)
summary(model.Solar.R)

#Or run as ANOVA.
wt.anova<-lm(Ozone~Solar.R.cat,
             weights=wts,
             data=airquality)
summary(wt.anova)
plot(wt.anova)
#compare with unweighted.
solar.anova<-lm(Ozone~Solar.R.cat,
                data=airquality)
summary(solar.anova)
plot(solar.anova)

#For more details on how to do this (including normalizing your weights, which is apparently needed
#if you plan to get confidence intervals), check the top answer here.
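#A minimal sketch of that normalization, assuming "normalizing" means rescaling the weights so they
#sum to the number of observations (check the answer referenced above for the full reasoning):
airquality$wts.norm<-airquality$wts*nrow(airquality)/sum(airquality$wts)
wt.lm.norm<-lm(Ozone~Solar.R,
               weights=wts.norm,
               data=airquality)
summary(wt.lm.norm) #Same coefficients as wt.lm; only the scale of the weights changes.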

###########################################
#Generalized linear models
###########################################
#The glm() function: how to choose a family and link.
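#Common families and their default links in glm(): gaussian (identity), binomial (logit),
#poisson (log), Gamma (inverse).  Pick the family to match the kind of response you have
#(counts, binary/proportions, strictly positive continuous values...), and only change the link if you have a reason to.
#(Note: the Depredations data set used below isn't built into base R; import or load it before running this chunk.)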
glm.wolf<-glm(number~latitude,
              data=Depredations,
              family=poisson)
#uses default link for family.
summary(glm.wolf)
plot(glm.wolf)

par(mfrow=c(1,1))
plot(Depredations$number~Depredations$latitude)
ndW<- data.frame("latitude"=seq(min(Depredations$latitude),
                               max(Depredations$latitude),
                               length.out=length(Depredations$latitude)))
#Plot the prediction using the new data (otherwise lines() follows the rows in their original order and the line zig-zags uselessly).
lines(ndW$latitude,
      predict.glm(glm.wolf,
                      newdata=ndW,
                  type="response"),
      lty="solid",lwd=2)
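
#If you also want an approximate confidence band around that line (a rough sketch of one way to do it):
#predict on the link (log) scale with se.fit=TRUE, add/subtract ~1.96 standard errors, then back-transform.
pred.wolf<-predict.glm(glm.wolf,
                       newdata=ndW,
                       se.fit=TRUE) #default type="link", i.e. the log scale.
lines(ndW$latitude,
      exp(pred.wolf$fit+1.96*pred.wolf$se.fit),
      lty="dashed")
lines(ndW$latitude,
      exp(pred.wolf$fit-1.96*pred.wolf$se.fit),
      lty="dashed")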

#GLM with a factor.
glm.snails<-glm(Deaths~Rel.Hum*Species,
                data=snails,
              family=poisson(link="log"))
#Here the link is specified manually, although log is already the default for the Poisson family.
summary(glm.snails)

#A quasi-Poisson model is suggested because the ratio of residual deviance to degrees of freedom is very high (overdispersion).
glm.snails.q<-glm(Deaths~Rel.Hum*Species,
                data=snails,
                family=quasipoisson)
summary(glm.snails.q)

#plot the two lines.
plot(snails$Deaths[snails$Species=="A"]~jitter(snails$Rel.Hum[snails$Species=="A"]),
     pch="A",
     cex=0.5,
     xlim=c(60,80),
     ylim=c(0,20))
points(snails$Deaths[snails$Species=="B"]~jitter(snails$Rel.Hum[snails$Species=="B"]),
     pch="B",
     cex=0.5)

ndA<- data.frame("Rel.Hum"=seq(min(snails$Rel.Hum),
                            max(snails$Rel.Hum),
                            length.out=length(snails$Rel.Hum)),
                        "Species"="A")
lines(ndA$Rel.Hum,
      predict.glm(glm.snails.q,
                  newdata=ndA,
                  type="response"), #type="response" is necessary to get predictions back on the response scale rather than the link scale.
      lty="solid") 

ndB<- data.frame("Rel.Hum"=seq(min(snails$Rel.Hum),
                            max(snails$Rel.Hum),
                            length.out=length(snails$Rel.Hum)),
                 "Species"="B")
lines(ndB$Rel.Hum,
      exp(predict.glm(glm.snails.q,
                  newdata=ndB)),
      lty="dotted")
#Just to show that predict.glm with type="response" is doing the same thing as exp() of the linear prediction:
lines(ndB$Rel.Hum,
      predict.glm(glm.snails.q,
                  newdata=ndB,
                  type="response"),
      lty="solid",
      col="blue")

(predictions.compared<-data.frame(snails$Species,
                                 snails$Deaths,
                                 fitted(glm.snails.q),
                                 predict(glm.snails.q, type="response"),
                                 predict(glm.snails.q))) #note that without type="response" you get the linear predictors only.

#Run as an ANCOVA (analysis of covariance)
Anova(glm.snails.q, type=3)

#More info on Poisson regression specifically.

#evaluation of distribution fit using deviance (see lecture).
#Look at summary(yourmodel)
summary(glm.snails.q)
#Ratio of residual deviance to degrees of freedom as described in lecture.
#You can also look for packages that expand on this topic.
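#A quick way to compute that ratio for the original Poisson fit
#(values much bigger than 1 suggest overdispersion, which is why the quasi-Poisson model was used above):
deviance(glm.snails)/df.residual(glm.snails)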

###########################################
#Nonparametric tests
###########################################
#Mann-Whitney/Wilcoxon tests.
wilcox.test(Ozone~Solar.R.cat,
            data=airquality)
#For paired (i.e. repeated) measurements, use wilcox.test(..., paired=TRUE), which is the Wilcoxon signed-rank test;
#pairwise.wilcox.test() instead runs all pairwise comparisons among several groups.
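#A tiny made-up example of the paired version (before/after measurements on the same ten units):
before<-rnorm(10,mean=10)
after<-before+rnorm(10,mean=1)
wilcox.test(before,after,paired=TRUE)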

#For two or MORE samples, use the Kruskal-Wallis rank sum test.
kruskal.test(Ozone~Month,
             data=airquality) #for two or more samples.
boxplot(Ozone~Month,
        data=airquality,
        xlab="Month",
        ylab="Ozone (ppb)")
#Opinions on post-hoc tests vary, but a few packages are available and easy to find via Google; one quick base-R option is sketched below.
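#One simple option (a sketch, not a recommendation): pairwise Wilcoxon tests with a p-value correction.
pairwise.wilcox.test(airquality$Ozone,
                     airquality$Month,
                     p.adjust.method="holm")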