This is part 2 of a previous post on linear models, so you'll need to start with the previous code to get the dataframe and variables used in this post.

#############################

#####Linear models: ANOVA####

#############################

#ANOVA is a type of linear model with a categorical independent variable

#and a particular analysis for partitioning variance. (Hence the name,

#Analysis O VAriance.)

#Use the same formulas and function as before but with categorical variables.

state.anova<-lm(wingchordr~state,

data=titmouse.na.omit)

summary(state.anova)

#This is the same thing as a t-test with only two levels.

t.test(titmouse.na.omit$wingchordr~as.numeric(titmouse.na.omit$state),

var.equal=TRUE)

titmouse.na.omit$species<-as.factor(titmouse.na.omit$species)

#Change species to a factor so we can compare these three titmouse populations.

crest.a<-lm(crestlength~species,

data=titmouse.na.omit)

summary(crest.a)

anova(crest.a)

#Base R (without packages) has this anova function, but does it do what we want?

?anova

#not super obvious which type of ANOVA though.

#Let's use an easier package, car, which we used earlier.

#It doesn't hurt to call a library twice so we'll put it here anyway in case we want to run this code separately.

library(car) #You'll note it doesn't give any messages in the R console if we've already run it in this session.

Anova(crest.a, type="III")

#Different types of ANOVA.

Anova(crest.a, type=3) #You can use numbers (without quotes) or Roman numerals (with quotes)

Anova(crest.a, type=2)

?Anova #Help lets us know that Type II and Type III do not exactly correspond to those in SAS, but have similar principles.

#It also lets us know that anova() in base R (without capitalization) is a type I test.

#More information on the different types as implemented in R compared with other standard programs (SAS and SPSS).

#Let's look at the spread of boxplots/points in each group, then at residuals.

plot(titmouse.na.omit$crestlength~titmouse.na.omit$species,

xlab="Species",

ylab="Crest length (mm)")

#Just to see what the points themselves look like.

plot(titmouse.na.omit$crestlength~jitter(as.numeric(titmouse.na.omit$species),

factor=0.05), #Add a bit of jitter so you can see all the points.

xlab="Species",

ylab="Crest length (mm)",

pch=21,

bg="black",

xaxt="n") #Stops x-axis from receiving numbered labels (the factors' numbers).

axis(side=1, #Add in the custom x-axis

at=1:3, #at positions 1 through 3

labels=c("Black-crested",

"hybrid",

"Tufted")) #with appropriate labels.

#What about the residuals? (section 8.3.2 of textbook)

plot(residuals(crest.a)~titmouse.na.omit$species,

xlab="Species",

ylab="residuals")

#Interactions in ANOVA

#Code this the same way you would code

crest.sp.z.a<-lm(crestlength~species*zone,

data=titmouse.na.omit)

summary(crest.sp.z.a)

Anova(crest.sp.z.a,

type="III")

#Unplanned comparisions, i.e., post hoc tests.

#Fisher's LSD

library(agricolae)

crest.LSD<-LSD.test(crest.a, #linear model that contains what you want to test

"species", #groups to test

p.adj="none", #none for standard Fisher's LSD

alpha=0.05, #alpha=0.05

console=TRUE #show your output.

)

#plot the group letters.

plot(titmouse.na.omit$crestlength~titmouse.na.omit$species,

xlab="Species",

ylab="Crest length (mm)",

ylim=c(15,25))

for (i in 1:3) { #Another loop, this time with three repeats.

text(x=i, #Different function, put i where you'd put 1:3.

y=24, #Place all at 24 on the y axis.

labels=crest.LSD$groups[3][i,])#label with the group results from the Fisher's LSD list.

}

#Bonferroni adjustment

pairwise.t.test(titmouse.na.omit$crestlength,

g=titmouse.na.omit$species,

p.adjust.method="bonf")

#Same results in a different format.

crest.lsd.bonf<-LSD.test(crest.a, #linear model

"species", #groups to test

p.adj="bonf", #Bonferroni correction

alpha=0.05, #alpha=0.05

console=TRUE #show your output.

)

#Tukey's Honest Significant Differences

library(multcomp)

crest.tukey<-glht(crest.a,

linfct=mcp(species="Tukey"))

summary(crest.tukey)

#There is also a base R function TukeyHDS() but I find this one simpler to read.

#Tukey's HSD not mentioned in lecture, but a good one.

#Use the same function to get Dunnett's test, which compares specifically to one level (often a control).

#Here our data are a bit weird for this because there's no reason to compare to what R thinks is the control

#which is the first factor (Black-crested, because it's alphabetical).

crest.dunnett<-glht(crest.a,

linfct=mcp(species="Dunnett"))

summary(crest.dunnett)

#A priori comparisions are planned comparisions and must be coded before the analysis.

#This is the clearest explanation of contrasts in R that I've found:

#http://www.uvm.edu/~dhowell/StatPages/More_Stuff/R/ContrastsInR.html

levels(titmouse.na.omit$species)#Check the ordering of your factor levels.

#In the matrix, the first one will be Black-crested, second will be hybrid, and third will be Tufted.

#Let's look at R-provided contrast matrices.

#There are functions to generate each matrix, then you use contrasts() to set it for your factor.

(sp.treatment<-contr.treatment(levels(titmouse.na.omit$species),

contrasts=TRUE))

apply(sp.treatment,2,sum) #check that column sums are zero (orthogonal contrasts are independent).

#It's not for contr.treatment (not orthogonal).

contrasts(titmouse.na.omit$species)<-sp.treatment #Apply this constrast matrix to your variable.

crest.contr1<-lm(crestlength~species,

data=titmouse.na.omit,

contrasts=TRUE)

Anova(crest.contr1, type='III')

summary(crest.contr1)

#You'll note this is the default for R. It compares to the first ("base") level.

sp.treatment<-contr.treatment(levels(titmouse.na.omit$species),

base=2, #Change the base to 2 to compare to 'hybrid'.

contrasts=TRUE)

apply(sp.treatment,2,sum) #check that column sums are zero (orthogonal contrasts are independent).

#It's not for contr.treatment (not orthogonal).

contrasts(titmouse.na.omit$species)<-sp.treatment

crest.contr2<-lm(crestlength~species,

data=titmouse.na.omit,

contrasts=TRUE)

Anova(crest.contr2, type='III')

summary(crest.contr2)

#Now it compares everything to level 2 (hybrids).

#A different example:

sp.helmert<-contr.helmert(levels(titmouse.na.omit$species),

contrasts=TRUE)

apply(sp.helmert,2,sum) #check that column sums are zero (orthogonal contrasts are independent).

contrasts(titmouse.na.omit$species)<-sp.helmert

crest.contr3<-lm(crestlength~species,

data=titmouse.na.omit,

contrasts=TRUE)

Anova(crest.contr3, type='III')

summary(crest.contr3)

#You'll notice the ANOVA tables are the same, but the summary with contrasts are different.

#More types of contrasts and their definitions are here:

#http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm

?contr.treatment #or contr.helmert, contr.poly, contr.sum, contr.SAS will all get you to the contrast matrix function help file.

#Custom contrasts are also possible.

#You make a matrix with an additional row of 1s for a constant/intercept (it's removed later).

(cmat.solve<-rbind(c(1,1,1), #rbind binds together rows.

'hybrid vs. parental'=c(1,-2,1), #label each comparison and make each list sum to zero.

'BCTI vs. TUTI'=c(-1,0,1)))

library(MASS) #invoke the MASS library to use fractions().

#Doesn't change your result, just makes it easier to read.

#This library comes installed with R, just needs called.

(test.solve<-fractions(solve(cmat.solve))) #Inverse of the matrix.

contrasts(titmouse.na.omit$species)<-test.solve[,2:3] #Select all but the intercept and apply it to contrasts().

apply(contrasts(titmouse.na.omit$species), 2, sum) # check to make sure sums are zero.

#If zero, then orthogonal and this method works. It's essentially zero with rounding error.

crest.contr4<-lm(crestlength~species,

data=titmouse.na.omit,

contrasts=TRUE)

Anova(crest.contr4, type='III')

summary(crest.contr4)

## Tuesday, March 10, 2015

## Tuesday, March 03, 2015

### Beginning R: linear models part 1 of 2 (simple and multiple regression) and looping

More code from the University of Manitoba class in quantitative methods that I'm co-teaching with my post-doc mentor, Dr. Nicola Koper.

#Start by setting your workspace and bringing in your data,

#and checking to make sure the variables are imported correctly

#with numbers as numeric, categories as factors, etc.

setwd("C:\\Users\\Username\\Dropbox\\")

#Your directory here.

#Or use the menu...

#Session>Set Working Directory>To Source File Location [or you can choose the directory]

#Import your data.

titmouse<- read.table(file="blog_banding_data.csv", header=TRUE, sep=",")

str(titmouse) #shows the data types; always do this check.

titmouse$species<-titmouse$indexsum # create a new column with indexsum data copied.

#Create a categorical species variable.

titmouse$species<-sub(0,"Tufted",titmouse$species)

titmouse$species<-sub(6,"Black-crested",titmouse$species)

titmouse[titmouse$species>0&titmouse$species<6,]$species = "hybrid"

#We need to omit the NA values for the dataEllipse function to work well and for simplifying various other plots and analyses,

#so create a new dataframe with all the variables that we'll use from titmouse.

titmouse.na.omit<-data.frame(na.omit(titmouse[,c("wingchordr",

"taillength",

"crestlength",

"billlength",

"indexsum",

"species",

"state",

"zone")]))

#############################

##Correlation and covariance#

#############################

#How to test for correlation in the data.

cor(x=titmouse.na.omit$crestlength,

y=titmouse.na.omit$indexsum,

use="pairwise.complete.obs", #Use options are given in ?cor and specifies what to do with missing values (NA). Here we're using the na.omit data so it's irrelevant.

method="pearson")

cor.test(x=titmouse.na.omit$crestlength,

y=titmouse.na.omit$indexsum,

na.action=na.rm, #Just to remind you it's here, titmouse.na.omit has no NAs.

method="pearson") #other methods are available, see the help file and examples below.

#Correlate more than one at a time, though you only get the correlation coefficient.

cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],

use="pairwise.complete.obs", #Use options are given in ?cor and specifies what to do with missing values.

method="spearman") #Example of another correlation method, this one non-parametric.

cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],

use="pairwise.complete.obs", #Use options are given in ?cor and specifies what to do with missing values.

method="pearson")

plot(wingchordr~taillength,

data=titmouse,

xlab="Tail length (mm)",

ylab="Wing chord (right) (mm)")

#Because correlation does not imply indepedence and dependence,

#it doesn't matter which is x and y when plotted.

plot(taillength~wingchordr,

xlab="Wing chord (right) (mm)",

ylab="Tail length (mm)",

data=titmouse)

#Make a plot with data ellipses.

library(car) #Call the 'car' library that contains the dataEllipse() function.

#From help: "dataEllipse superimposes the normal-probability contours over a scatterplot of the data."

coords.ellipse<-dataEllipse(

x=titmouse.na.omit$wingchordr,

xlab="Wing chord (right) (mm)",

y=titmouse.na.omit$taillength,

ylab="Tail length (mm)",

levels=c(0.68, 0.95, 0.975), #levels are quantiles. 0.68 is about one standard deviation assuming normality.

lty=2, #Dashed lines

lwd=1, #width=1

fill=TRUE, fill.alpha=0.1, #Fill ellipses with transparent color.

grid=FALSE, #No background grid.

draw=TRUE, #change to TRUE to show on plot.

plot.points=FALSE, #Add the points back over the plot.

center.pch=FALSE, #No centroid point shown.

add=FALSE, #Don't add to the previous plot we made, which were the points for the correlation.

col="black", #black borders

pch=21, #filled points

bg="black") #black fill

#Or, fill the ellipses manually with polygon().

plot(taillength~wingchordr,

data=titmouse,

xlab="Wing chord (right) (mm)",

ylab="Tail length (mm)")

polygon(x=coords.ellipse[[1]][,"x"], #Get the ellipse coordinates from the object we created previously.

y=coords.ellipse[[1]][,"y"],

col=adjustcolor("darkgray",alpha.f=0.9),

border=NA)

polygon(x=coords.ellipse[[2]][,"x"],

y=coords.ellipse[[2]][,"y"],

col=adjustcolor("gray",alpha.f=0.8),

border=NA)

polygon(x=coords.ellipse[[3]][,"x"],

y=coords.ellipse[[3]][,"y"],

col=adjustcolor("gray",alpha.f=0.3),

border=NA)

#Add points back over the polygons.

points(taillength~wingchordr,

data=titmouse,

pch=21,

bg="black")

#############################

##Linear models: regression##

#############################

#R's linear regression is based on the lm() function. This is the general linear model.

#The glm() function, confusingly, is the GENERALIZED linear model

#(which we'll use later in the semester; it allows for use of non-normal distributions).

#So, this time we only use lm().

regression.bw<-lm(crestlength~wingchordr,

data=titmouse.na.omit)

#Use this notation with clean variable names and the data argument,

#as it inexplicably messes up confidence limit predictions later if you don't. Don't ask me why!

summary(regression.bw)

#Slope=-0.10926, i.e. 0.10926 decrease in crestlength for each unit of wing chord.

#Slope from lecture: "beta = "value" = parameter estimate = slope"

##Multiple R-squared, from lecture: "How much of the variance in our model is explained by our independent variables?"

#intercept is where x=0 as you can see from the plot below.

plot(crestlength~wingchordr,

data=titmouse,

xlim=c(0,90),

ylim=c(0,50),

xlab="Crest length (mm)",

ylab="Wing chord (right) (mm)")

abline(regression.bw, v=0)

text(x=10,

y=regression.bw$coef[1]+5,

labels=c("Intercept")) #Add some text nearby the intercept to point it out.

#Getting data from your regression model.

coef(regression.bw) #only extracts the coefficients B0 and B1 (intercept and slope).

fitted.values(regression.bw) #extract the fitted values.

#Reminder of how to get diagnostic plots:

plot(regression.bw)

#And here's how to get out residuals if you want to analyze them beyond the default diagnostic plots"

residuals(regression.bw) #Raw residuals

rstudent(regression.bw) #Studentized residuals

#Analyze your diagnostics following the lab from 27 Jan. 2015.

#Plotting and confidence intervals

#Get confidence intervals for parameters.

confint(regression.bw, level=0.95) #for parameter confidence intervals, change % with level.

#Estimate confidence and prediction values for a series of evenly spaced new x values

nd<- data.frame("wingchordr"=seq(min(titmouse.na.omit$wingchordr),

max(titmouse.na.omit$wingchordr), #Range min and max of your values using seq()

length.out=length(titmouse.na.omit$wingchordr))) #same length as original data

conf <- predict(regression.bw,interval="confidence",newdata=nd) #Use the predict function to get confidence intervals

pred <- predict(regression.bw,interval="prediction",newdata=nd) #Use the predict function to get prediction intervals

yhat <- predict(regression.bw,newdata=nd) #Use the predict function to get predicted values

#Plotting everything together:

plot(crestlength~wingchordr,

xlab="Wing chord (right) (mm)",

ylab="Crest length (mm)",

data=titmouse.na.omit) ## data

abline(regression.bw) ## fit

lines(nd$wingchordr,pred[,c("lwr")],col="red",lty=2) #Plot the prediction intervals, which predicts where new data may be sampled.

lines(nd$wingchordr,pred[,c("upr")],col="red",lty=2)

lines(nd$wingchordr,conf[,c("lwr")],col="red",lty=1) #Plot the confidence intervals, which show where the mean may be.

lines(nd$wingchordr,conf[,c("upr")],col="red",lty=1)

lines(nd$wingchordr,yhat, col="blue", lwd=2) #The fit line only for your range of data.

#Or add a shaded fill if you prefer (as shown last week in lecture)

polygon(x=c(nd$wingchordr, rev(nd$wingchordr)),

y=c(conf[,c("lwr")],rev(conf[,c("upr")])),

col=adjustcolor("gray",alpha.f=0.2),

border=NA)

#Code from http://stackoverflow.com/questions/12544090/predict-lm-in-r-how-to-get-confidence-intervals-right

#Differences between confidence intervals (around mean) and prediction intervals (where next data point is likely from):

#http://blog.minitab.com/blog/adventures-in-statistics/when-should-i-use-confidence-intervals-prediction-intervals-and-tolerance-intervals

#http://www.graphpad.com/support/faqid/1506/

#####################

#Multiple regression#

#####################

regression.cwh<-lm(crestlength~wingchordr+indexsum, #simply add the additional variable.

data=titmouse)

summary(regression.cwh,

correlation=TRUE) #Show the correlation of the coefficients.

#Each variable means the effect of that variable holding the remaining variables constant.

plot(regression.cwh) #Get diagnostics again!

#Interactions in multiple regression

#Each non-interacting parameter now means the effect of that variable at X=0 (only at the intercept).

regression.cwhi1<-lm(crestlength~wingchordr*indexsum,

data=titmouse)

summary(regression.cwhi1,

correlation=TRUE)

#A different notation with identical results.

regression.cwhi2<-lm(crestlength~wingchordr+indexsum+wingchordr:indexsum,

data=titmouse)

summary(regression.cwhi2,

correlation=TRUE) #correlation=TRUE returns the correlation matrix of parameters.

plot(regression.cwhi2) #Check the diagnostic plots.

#To get a simple 3D plot, call the 'lattice' package.

library(lattice)

#Save the defaults so it doesn't mess up your plot settings later.

pardefault <- par(no.readonly = T)

cloud(crestlength~wingchordr*indexsum,

data=titmouse.na.omit)

#It's kinda hard to tell where the points are though.

library(scatterplot3d)

#Get a plot with points and lines connecting to base grid.

s3d <-scatterplot3d(x=titmouse.na.omit$wingchordr,

y=titmouse.na.omit$indexsum,

z=titmouse.na.omit$crestlength,

xlab="Wing chord (right) (mm)",

ylab="Hybrid index",

zlab="Crest length (mm)",

pch=16,

highlight.3d=TRUE,

type="h",

main="3D Scatterplot")

s3d$plane3d(regression.cwh) #Fits a linear plane over the plot, does not use the interaction.

#There are lots of ways to fit 3d surfaces.

#Additional plotting methods can be found here:

#http://www.statmethods.net/graphs/scatterplot.html

#Plot interactions with lines.

par(pardefault) #scatterplot 3D messes up something so we call pardefault back.

plot(crestlength~wingchordr,

data=titmouse.na.omit[titmouse.na.omit$indexsum==0,],

xlab="Wing chord (right) (mm)",

ylab="Crest length (mm)",

xlim=c(min(titmouse.na.omit$wingchordr),#These next four lines ensure no points get left off later.

max(titmouse.na.omit$wingchordr)),

ylim=c(min(titmouse.na.omit$crestlength),

max(titmouse.na.omit$crestlength)))

#Learn how to loop.

for (i in 1:6 ) { #i is what goes in the loop. The series of numbers are your values.

points(crestlength~wingchordr, #Put your functions/commands inside the brackets.

data=titmouse.na.omit[titmouse.na.omit$indexsum==i,], #Note that i goes in place of the number (0 in plot)

pch=21,

bg=i+1) #Another i here.

}

#The colors are funky though.

palette() #Check the color list.

default.colors<-palette() #Save the default palette.

(gray<-gray.colors(7, start=1, end=0)) #Create a gray scale.

#creates standard HYBRID INDEX Gray palette that has white (1) through black (7)

#for colors. Can be used to label figures with indexsum+1 (bg/col won't read 0).

gray.palette<-palette(gray) #Set the palette to gray and save it as an option like you did for default.

palette() #shows palette color names.

#Run the loop again and see how much easier it is to read.

for (i in 1:6 ) { #i is what goes in the loop. The series of numbers are your values.

points(crestlength~wingchordr, #Put your functions/commands inside the brackets.

data=titmouse.na.omit[titmouse.na.omit$indexsum==i,], #Note that i goes in place of the number (0 in plot)

pch=21,

bg=i+1) #You use +1 because 0-6 is the hybrid index but 1-7 matches the gray palette we made.

}

#reset back to default if desired.

palette(default.colors)

palette()

#Add a legend.

legend(title="Hybrid index values",

title.adj=0,

x=69.2, y=17.5, #Where the upper right corner of the legend is located on the coordinates of the plot.

ncol=7, #Number of columns. ncol=7 makes the legend horizontal, essentially.

xjust=0, bty="n", #How the legend is justified (left, right, center). Check ?legend for details. bty= box type. "n"= none.

x.intersp=0.75, y.intersp=1, #Character and line spacing.

text.width=0.5, #Width of legend text.

legend = c(0,1,2,3,4,5,6), #Values in the legend. Can be text if in quotes.

pt.lwd=2, #Line width around each point.

pt.bg=gray, #Background color for each point, here taken from the gray object (a set of color values).

pch=c(21,21,21,21,21,21), #Each of the seven points should be type 21 (which has a background (bg) fill)

cex=0.8, pt.cex=1.25) #Adjust the size of the text and points relative to the rest of the plot. Cex means character expansion factor.

#Adjust legend location as needed for your screen size or plot area.

########Adding lines for the interactions

#To plot the lines, for each value of the other variable, we need to use predict.lm()

#for each value of indexsum (0-6).

#We'll make a loop that assigns new predictive dataframes to items 1-7 in a list

#(equivalent to index 0-6, 7 items, but the list can't have a 0th item).

#Then it takes those values (if you look in the list the range of wingchordr

#is always the same, indexsum=the current value of interest).

#Then we use predict.lm to give us predicted values at each desired value of indexsum.

#Create an empty list to store values.

ndm<-list()

#Loop through the fit code.

for (i in 1:7){

ndm[[i]]<- data.frame("wingchordr"=seq(min(titmouse.na.omit$wingchordr),

max(titmouse.na.omit$wingchordr),

length.out=length(titmouse.na.omit$wingchordr)),

"indexsum"=i-1) #indexsum values go 0-6 but the list can't have a 0th item, only 1st-7th.

lines(predict.lm(regression.cwhi1, newdata=ndm[[i]]), col="black")

}

#Add some labels showing that the top regression line is for indexsum=6

#and the bottom line is indexsum=0

text("Black-crested (6)", x=85, y=21.5, cex=0.5)

text("Tufted (0)", x=85, y=18, cex=0.5)

#You can see in the plot that there are no interactions, which matches with

#the summary statistics... in this model, there was no significant effect of wingchordr

#or indexsum on crestlength, nor was the interaction significant.

#You can predict individual points as well.

#In class we used an Excel spreadsheet to manually calculate the Y (crestlength) values.

#You can create one yourself with the intercept and slope values and interactions to see get the same values.

#The following R code shows that manual calculations match up with what R is doing.

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=70,"indexsum"=0))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=70,"indexsum"=6))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=80,"indexsum"=0))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=80,"indexsum"=6))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=90,"indexsum"=0))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=90,"indexsum"=6))

#Yup, the values match (ignoring rounding errors)!

#Lots of additional ways (using base R and more packages) to look at your multiple regression:

#http://colbyimaging.duckdns.org:8080/wiki/statistics/multiple-regression

#########################################

#Other variations on regression are major axis (MA) and

#reduced major axis (RMA) regressions, and loess smoothing.

#Loess smoothing

line.loess<-loess(titmouse.na.omit$billlength~titmouse.na.omit$wingchordr,

span=0.75, #degree of smoothing

family="gaussian", #fitting by least squares

degree=2) #polynomial degree, usually 1 or 2.

#Plot the points and smoothing line.

order.loess<-order(titmouse.na.omit$wingchordr) #orders lines so curve is smooth

plot(billlength~wingchordr,

data=titmouse.na.omit,

xlab="Wing chord (right) (mm)",

ylab="Bill length (mm)")

lines(titmouse.na.omit$wingchordr[order.loess],line.loess$fitted[order.loess]) #Plot the loess smooth on your scatterplot.

#For more information on setting parameters and appropriate use of loess curves, see

#CLEVELAND, W. S. 1979. Robust locally weighted regression and smoothing scatterplots. J. Am. Stat. Assoc., 74:829-836.

#For other types of regression, check out the 'lmodel2' package and set regression type via "method=" argument.

#Be careful of the names of the regressions as they vary between books;

# look at the details to make sure you get the

#type of regression you want.

#Finally, you can use ANOVA for linear regression (with all continuous variables).

anova(regression.cwh)

#Since we use type I here, each term adds in sequentially.

#The model improves significantly when we add in hybrid index.

#This makes sense when we get differences by species in our regular ANOVA next week!

#Start by setting your workspace and bringing in your data,

#and checking to make sure the variables are imported correctly

#with numbers as numeric, categories as factors, etc.

setwd("C:\\Users\\Username\\Dropbox\\")

#Your directory here.

#Or use the menu...

#Session>Set Working Directory>To Source File Location [or you can choose the directory]

#Import your data.

titmouse<- read.table(file="blog_banding_data.csv", header=TRUE, sep=",")

str(titmouse) #shows the data types; always do this check.

titmouse$species<-titmouse$indexsum # create a new column with indexsum data copied.

#Create a categorical species variable.

titmouse$species<-sub(0,"Tufted",titmouse$species)

titmouse$species<-sub(6,"Black-crested",titmouse$species)

titmouse[titmouse$species>0&titmouse$species<6,]$species = "hybrid"

#We need to omit the NA values for the dataEllipse function to work well and for simplifying various other plots and analyses,

#so create a new dataframe with all the variables that we'll use from titmouse.

titmouse.na.omit<-data.frame(na.omit(titmouse[,c("wingchordr",

"taillength",

"crestlength",

"billlength",

"indexsum",

"species",

"state",

"zone")]))

#############################

##Correlation and covariance#

#############################

#How to test for correlation in the data.

cor(x=titmouse.na.omit$crestlength,

y=titmouse.na.omit$indexsum,

use="pairwise.complete.obs", #Use options are given in ?cor and specifies what to do with missing values (NA). Here we're using the na.omit data so it's irrelevant.

method="pearson")

cor.test(x=titmouse.na.omit$crestlength,

y=titmouse.na.omit$indexsum,

na.action=na.rm, #Just to remind you it's here, titmouse.na.omit has no NAs.

method="pearson") #other methods are available, see the help file and examples below.

#Correlate more than one at a time, though you only get the correlation coefficient.

cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],

use="pairwise.complete.obs", #Use options are given in ?cor and specifies what to do with missing values.

method="spearman") #Example of another correlation method, this one non-parametric.

cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],

use="pairwise.complete.obs", #Use options are given in ?cor and specifies what to do with missing values.

method="pearson")

plot(wingchordr~taillength,

data=titmouse,

xlab="Tail length (mm)",

ylab="Wing chord (right) (mm)")

#Because correlation does not imply indepedence and dependence,

#it doesn't matter which is x and y when plotted.

plot(taillength~wingchordr,

xlab="Wing chord (right) (mm)",

ylab="Tail length (mm)",

data=titmouse)

#Make a plot with data ellipses.

library(car) #Call the 'car' library that contains the dataEllipse() function.

#From help: "dataEllipse superimposes the normal-probability contours over a scatterplot of the data."

coords.ellipse<-dataEllipse(

x=titmouse.na.omit$wingchordr,

xlab="Wing chord (right) (mm)",

y=titmouse.na.omit$taillength,

ylab="Tail length (mm)",

levels=c(0.68, 0.95, 0.975), #levels are quantiles. 0.68 is about one standard deviation assuming normality.

lty=2, #Dashed lines

lwd=1, #width=1

fill=TRUE, fill.alpha=0.1, #Fill ellipses with transparent color.

grid=FALSE, #No background grid.

draw=TRUE, #change to TRUE to show on plot.

plot.points=FALSE, #Add the points back over the plot.

center.pch=FALSE, #No centroid point shown.

add=FALSE, #Don't add to the previous plot we made, which were the points for the correlation.

col="black", #black borders

pch=21, #filled points

bg="black") #black fill

#Or, fill the ellipses manually with polygon().

plot(taillength~wingchordr,

data=titmouse,

xlab="Wing chord (right) (mm)",

ylab="Tail length (mm)")

polygon(x=coords.ellipse[[1]][,"x"], #Get the ellipse coordinates from the object we created previously.

y=coords.ellipse[[1]][,"y"],

col=adjustcolor("darkgray",alpha.f=0.9),

border=NA)

polygon(x=coords.ellipse[[2]][,"x"],

y=coords.ellipse[[2]][,"y"],

col=adjustcolor("gray",alpha.f=0.8),

border=NA)

polygon(x=coords.ellipse[[3]][,"x"],

y=coords.ellipse[[3]][,"y"],

col=adjustcolor("gray",alpha.f=0.3),

border=NA)

#Add points back over the polygons.

points(taillength~wingchordr,

data=titmouse,

pch=21,

bg="black")

#############################

##Linear models: regression##

#############################

#R's linear regression is based on the lm() function. This is the general linear model.

#The glm() function, confusingly, is the GENERALIZED linear model

#(which we'll use later in the semester; it allows for use of non-normal distributions).

#So, this time we only use lm().

regression.bw<-lm(crestlength~wingchordr,

data=titmouse.na.omit)

#Use this notation with clean variable names and the data argument,

#as it inexplicably messes up confidence limit predictions later if you don't. Don't ask me why!

summary(regression.bw)

#Slope=-0.10926, i.e. 0.10926 decrease in crestlength for each unit of wing chord.

#Slope from lecture: "beta = "value" = parameter estimate = slope"

##Multiple R-squared, from lecture: "How much of the variance in our model is explained by our independent variables?"

#intercept is where x=0 as you can see from the plot below.

plot(crestlength~wingchordr,

data=titmouse,

xlim=c(0,90),

ylim=c(0,50),

xlab="Crest length (mm)",

ylab="Wing chord (right) (mm)")

abline(regression.bw, v=0)

text(x=10,

y=regression.bw$coef[1]+5,

labels=c("Intercept")) #Add some text nearby the intercept to point it out.

#Getting data from your regression model.

coef(regression.bw) #only extracts the coefficients B0 and B1 (intercept and slope).

fitted.values(regression.bw) #extract the fitted values.

#Reminder of how to get diagnostic plots:

plot(regression.bw)

#And here's how to get out residuals if you want to analyze them beyond the default diagnostic plots"

residuals(regression.bw) #Raw residuals

rstudent(regression.bw) #Studentized residuals

#Analyze your diagnostics following the lab from 27 Jan. 2015.

#Plotting and confidence intervals

#Get confidence intervals for parameters.

confint(regression.bw, level=0.95) #for parameter confidence intervals, change % with level.

#Estimate confidence and prediction values for a series of evenly spaced new x values

nd<- data.frame("wingchordr"=seq(min(titmouse.na.omit$wingchordr),

max(titmouse.na.omit$wingchordr), #Range min and max of your values using seq()

length.out=length(titmouse.na.omit$wingchordr))) #same length as original data

conf <- predict(regression.bw,interval="confidence",newdata=nd) #Use the predict function to get confidence intervals

pred <- predict(regression.bw,interval="prediction",newdata=nd) #Use the predict function to get prediction intervals

yhat <- predict(regression.bw,newdata=nd) #Use the predict function to get predicted values

#Plotting everything together:

plot(crestlength~wingchordr,

xlab="Wing chord (right) (mm)",

ylab="Crest length (mm)",

data=titmouse.na.omit) ## data

abline(regression.bw) ## fit

lines(nd$wingchordr,pred[,c("lwr")],col="red",lty=2) #Plot the prediction intervals, which predicts where new data may be sampled.

lines(nd$wingchordr,pred[,c("upr")],col="red",lty=2)

lines(nd$wingchordr,conf[,c("lwr")],col="red",lty=1) #Plot the confidence intervals, which show where the mean may be.

lines(nd$wingchordr,conf[,c("upr")],col="red",lty=1)

lines(nd$wingchordr,yhat, col="blue", lwd=2) #The fit line only for your range of data.

#Or add a shaded fill if you prefer (as shown last week in lecture)

polygon(x=c(nd$wingchordr, rev(nd$wingchordr)),

y=c(conf[,c("lwr")],rev(conf[,c("upr")])),

col=adjustcolor("gray",alpha.f=0.2),

border=NA)

#Code from http://stackoverflow.com/questions/12544090/predict-lm-in-r-how-to-get-confidence-intervals-right

#Differences between confidence intervals (around mean) and prediction intervals (where next data point is likely from):

#http://blog.minitab.com/blog/adventures-in-statistics/when-should-i-use-confidence-intervals-prediction-intervals-and-tolerance-intervals

#http://www.graphpad.com/support/faqid/1506/

#####################

#Multiple regression#

#####################

regression.cwh<-lm(crestlength~wingchordr+indexsum, #simply add the additional variable.

data=titmouse)

summary(regression.cwh,

correlation=TRUE) #Show the correlation of the coefficients.

#Each variable means the effect of that variable holding the remaining variables constant.

plot(regression.cwh) #Get diagnostics again!

#Interactions in multiple regression

#Each non-interacting parameter now means the effect of that variable at X=0 (only at the intercept).

regression.cwhi1<-lm(crestlength~wingchordr*indexsum,

data=titmouse)

summary(regression.cwhi1,

correlation=TRUE)

#A different notation with identical results.

regression.cwhi2<-lm(crestlength~wingchordr+indexsum+wingchordr:indexsum,

data=titmouse)

summary(regression.cwhi2,

correlation=TRUE) #correlation=TRUE returns the correlation matrix of parameters.

plot(regression.cwhi2) #Check the diagnostic plots.

#To get a simple 3D plot, call the 'lattice' package.

library(lattice)

#Save the defaults so it doesn't mess up your plot settings later.

pardefault <- par(no.readonly = T)

cloud(crestlength~wingchordr*indexsum,

data=titmouse.na.omit)

#It's kinda hard to tell where the points are though.

library(scatterplot3d)

#Get a plot with points and lines connecting to base grid.

s3d <-scatterplot3d(x=titmouse.na.omit$wingchordr,

y=titmouse.na.omit$indexsum,

z=titmouse.na.omit$crestlength,

xlab="Wing chord (right) (mm)",

ylab="Hybrid index",

zlab="Crest length (mm)",

pch=16,

highlight.3d=TRUE,

type="h",

main="3D Scatterplot")

s3d$plane3d(regression.cwh) #Fits a linear plane over the plot, does not use the interaction.

#There are lots of ways to fit 3d surfaces.

#Additional plotting methods can be found here:

#http://www.statmethods.net/graphs/scatterplot.html

#Plot interactions with lines.

par(pardefault) #scatterplot 3D messes up something so we call pardefault back.

plot(crestlength~wingchordr,

data=titmouse.na.omit[titmouse.na.omit$indexsum==0,],

xlab="Wing chord (right) (mm)",

ylab="Crest length (mm)",

xlim=c(min(titmouse.na.omit$wingchordr),#These next four lines ensure no points get left off later.

max(titmouse.na.omit$wingchordr)),

ylim=c(min(titmouse.na.omit$crestlength),

max(titmouse.na.omit$crestlength)))

#Learn how to loop.

for (i in 1:6 ) { #i is what goes in the loop. The series of numbers are your values.

points(crestlength~wingchordr, #Put your functions/commands inside the brackets.

data=titmouse.na.omit[titmouse.na.omit$indexsum==i,], #Note that i goes in place of the number (0 in plot)

pch=21,

bg=i+1) #Another i here.

}

#The colors are funky though.

palette() #Check the color list.

default.colors<-palette() #Save the default palette.

(gray<-gray.colors(7, start=1, end=0)) #Create a gray scale.

#creates standard HYBRID INDEX Gray palette that has white (1) through black (7)

#for colors. Can be used to label figures with indexsum+1 (bg/col won't read 0).

gray.palette<-palette(gray) #Set the palette to gray and save it as an option like you did for default.

palette() #shows palette color names.

#Run the loop again and see how much easier it is to read.

for (i in 1:6 ) { #i is what goes in the loop. The series of numbers are your values.

points(crestlength~wingchordr, #Put your functions/commands inside the brackets.

data=titmouse.na.omit[titmouse.na.omit$indexsum==i,], #Note that i goes in place of the number (0 in plot)

pch=21,

bg=i+1) #You use +1 because 0-6 is the hybrid index but 1-7 matches the gray palette we made.

}

#reset back to default if desired.

palette(default.colors)

palette()

#Add a legend.

legend(title="Hybrid index values",

title.adj=0,

x=69.2, y=17.5, #Where the upper right corner of the legend is located on the coordinates of the plot.

ncol=7, #Number of columns. ncol=7 makes the legend horizontal, essentially.

xjust=0, bty="n", #How the legend is justified (left, right, center). Check ?legend for details. bty= box type. "n"= none.

x.intersp=0.75, y.intersp=1, #Character and line spacing.

text.width=0.5, #Width of legend text.

legend = c(0,1,2,3,4,5,6), #Values in the legend. Can be text if in quotes.

pt.lwd=2, #Line width around each point.

pt.bg=gray, #Background color for each point, here taken from the gray object (a set of color values).

pch=c(21,21,21,21,21,21), #Each of the seven points should be type 21 (which has a background (bg) fill)

cex=0.8, pt.cex=1.25) #Adjust the size of the text and points relative to the rest of the plot. Cex means character expansion factor.

#Adjust legend location as needed for your screen size or plot area.

########Adding lines for the interactions

#To plot the lines, for each value of the other variable, we need to use predict.lm()

#for each value of indexsum (0-6).

#We'll make a loop that assigns new predictive dataframes to items 1-7 in a list

#(equivalent to index 0-6, 7 items, but the list can't have a 0th item).

#Then it takes those values (if you look in the list the range of wingchordr

#is always the same, indexsum=the current value of interest).

#Then we use predict.lm to give us predicted values at each desired value of indexsum.

#Create an empty list to store values.

ndm<-list()

#Loop through the fit code.

for (i in 1:7){

ndm[[i]]<- data.frame("wingchordr"=seq(min(titmouse.na.omit$wingchordr),

max(titmouse.na.omit$wingchordr),

length.out=length(titmouse.na.omit$wingchordr)),

"indexsum"=i-1) #indexsum values go 0-6 but the list can't have a 0th item, only 1st-7th.

lines(predict.lm(regression.cwhi1, newdata=ndm[[i]]), col="black")

}

#Add some labels showing that the top regression line is for indexsum=6

#and the bottom line is indexsum=0

text("Black-crested (6)", x=85, y=21.5, cex=0.5)

text("Tufted (0)", x=85, y=18, cex=0.5)

#You can see in the plot that there are no interactions, which matches with

#the summary statistics... in this model, there was no significant effect of wingchordr

#or indexsum on crestlength, nor was the interaction significant.

#You can predict individual points as well.

#In class we used an Excel spreadsheet to manually calculate the Y (crestlength) values.

#You can create one yourself with the intercept and slope values and interactions to see get the same values.

#The following R code shows that manual calculations match up with what R is doing.

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=70,"indexsum"=0))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=70,"indexsum"=6))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=80,"indexsum"=0))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=80,"indexsum"=6))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=90,"indexsum"=0))

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=90,"indexsum"=6))

#Yup, the values match (ignoring rounding errors)!

#Lots of additional ways (using base R and more packages) to look at your multiple regression:

#http://colbyimaging.duckdns.org:8080/wiki/statistics/multiple-regression

#########################################

#Other variations on regression are major axis (MA) and

#reduced major axis (RMA) regressions, and loess smoothing.

#Loess smoothing

line.loess<-loess(titmouse.na.omit$billlength~titmouse.na.omit$wingchordr,

span=0.75, #degree of smoothing

family="gaussian", #fitting by least squares

degree=2) #polynomial degree, usually 1 or 2.

#Plot the points and smoothing line.

order.loess<-order(titmouse.na.omit$wingchordr) #orders lines so curve is smooth

plot(billlength~wingchordr,

data=titmouse.na.omit,

xlab="Wing chord (right) (mm)",

ylab="Bill length (mm)")

lines(titmouse.na.omit$wingchordr[order.loess],line.loess$fitted[order.loess]) #Plot the loess smooth on your scatterplot.

#For more information on setting parameters and appropriate use of loess curves, see

#CLEVELAND, W. S. 1979. Robust locally weighted regression and smoothing scatterplots. J. Am. Stat. Assoc., 74:829-836.

#For other types of regression, check out the 'lmodel2' package and set regression type via "method=" argument.

#Be careful of the names of the regressions as they vary between books;

# look at the details to make sure you get the

#type of regression you want.

#Finally, you can use ANOVA for linear regression (with all continuous variables).

anova(regression.cwh)

#Since we use type I here, each term adds in sequentially.

#The model improves significantly when we add in hybrid index.

#This makes sense when we get differences by species in our regular ANOVA next week!

Subscribe to:
Posts (Atom)