Tuesday, March 03, 2015

Beginning R: linear models part 1 of 2 (simple and multiple regression) and looping

More code from the University of Manitoba class in quantitative methods that I'm co-teaching with my post-doc mentor, Dr. Nicola Koper.

#Start by setting your workspace and bringing in your data,
#and checking to make sure the variables are imported correctly
#with numbers as numeric, categories as factors, etc.
setwd("C:\\Users\\Username\\Dropbox\\")
#Your directory here.
#Or use the menu...
#Session>Set Working Directory>To Source File Location [or you can choose the directory]

#Import your data.
titmouse<- read.table(file="blog_banding_data.csv", header=TRUE, sep=",")
str(titmouse)
#shows the data types; always do this check.
titmouse$species<-titmouse$indexsum # create a new column with indexsum data copied.
#Create a categorical species variable.
titmouse$species<-sub(0,"Tufted",titmouse$species)
titmouse$species<-sub(6,"Black-crested",titmouse$species)
titmouse[titmouse$species>0&titmouse$species<6,]$species = "hybrid"


#We need to omit the NA values for the dataEllipse function to work well and for simplifying various other plots and analyses,
#so create a new dataframe with all the variables that we'll use from titmouse.
titmouse.na.omit<-data.frame(na.omit(titmouse[,c("wingchordr",
                                                 "taillength",
                                                 "crestlength",
                                                 "billlength",
                                                 "indexsum",
                                                 "species",
                                                 "state",
                                                 "zone")]))



#############################
##Correlation and covariance#
#############################

#How to test for correlation in the data.
cor(x=titmouse.na.omit$crestlength,
    y=titmouse.na.omit$indexsum,
    use="pairwise.complete.obs",
#Use options are given in ?cor and specifies what to do with missing values (NA).  Here we're using the na.omit data so it's irrelevant.
    method="pearson")
cor.test(x=titmouse.na.omit$crestlength,
         y=titmouse.na.omit$indexsum,
         na.action=na.rm,
#Just to remind you it's here, titmouse.na.omit has no NAs.
         method="pearson") #other methods are available, see the help file and examples below.

#Correlate more than one at a time, though you only get the correlation coefficient.
cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],
    use="pairwise.complete.obs",
  #Use options are given in ?cor and specifies what to do with missing values.
    method="spearman")   #Example of another correlation method, this one non-parametric.

cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],
    use="pairwise.complete.obs",
#Use options are given in ?cor and specifies what to do with missing values.
    method="pearson")


plot(wingchordr~taillength,
     data=titmouse,
     xlab="Tail length (mm)",
     ylab="Wing chord (right) (mm)")

#Because correlation does not imply indepedence and dependence,
#it doesn't matter which is x and y when plotted.
plot(taillength~wingchordr,
     xlab="Wing chord (right) (mm)",
     ylab="Tail length (mm)",
     data=titmouse)


#Make a plot with data ellipses.

library(car) #Call the 'car' library that contains the dataEllipse() function.

#From help: "dataEllipse superimposes the normal-probability contours over a scatterplot of the data."
coords.ellipse<-dataEllipse(
            x=titmouse.na.omit$wingchordr,
            xlab="Wing chord (right) (mm)",
            y=titmouse.na.omit$taillength,
            ylab="Tail length (mm)",
            levels=c(0.68, 0.95, 0.975),
#levels are quantiles.  0.68 is about one standard deviation assuming normality.
            lty=2, #Dashed lines
            lwd=1, #width=1
            fill=TRUE, fill.alpha=0.1, #Fill ellipses with transparent color.
            grid=FALSE, #No background grid.
            draw=TRUE, #change to TRUE to show on plot.
            plot.points=FALSE, #Add the points back over the plot.
            center.pch=FALSE, #No centroid point shown.
            add=FALSE, #Don't add to the previous plot we made, which were the points for the correlation.
            col="black", #black borders
            pch=21, #filled points
            bg="black") #black fill

#Or, fill the ellipses manually with polygon().

plot(taillength~wingchordr,
     data=titmouse,
     xlab="Wing chord (right) (mm)",
     ylab="Tail length (mm)")

polygon(x=coords.ellipse[[1]][,"x"],
#Get the ellipse coordinates from the object we created previously.
        y=coords.ellipse[[1]][,"y"],
        col=adjustcolor("darkgray",alpha.f=0.9),
                        border=NA)
polygon(x=coords.ellipse[[2]][,"x"],
        y=coords.ellipse[[2]][,"y"],
        col=adjustcolor("gray",alpha.f=0.8),
        border=NA)
polygon(x=coords.ellipse[[3]][,"x"],
        y=coords.ellipse[[3]][,"y"],
        col=adjustcolor("gray",alpha.f=0.3),
        border=NA)

#Add points back over the polygons.
points(taillength~wingchordr,
       data=titmouse,
       pch=21,
       bg="black")


#############################
##Linear models: regression##
#############################
#R's linear regression is based on the lm() function.  This is the general linear model.
#The glm() function, confusingly, is the GENERALIZED linear model
#(which we'll use later in the semester; it allows for use of non-normal distributions).
#So, this time we only use lm().

regression.bw<-lm(crestlength~wingchordr,
                  data=titmouse.na.omit)

#Use this notation with clean variable names and the data argument,
#as it inexplicably messes up confidence limit predictions later if you don't.  Don't ask me why!

summary(regression.bw)
#Slope=-0.10926, i.e. 0.10926 decrease in crestlength for each unit of wing chord.
#Slope from lecture: "beta = "value" = parameter estimate = slope"
##Multiple R-squared, from lecture: "How much of the variance in our model is explained by our independent variables?"
#intercept is where x=0 as you can see from the plot below.
plot(crestlength~wingchordr,
         data=titmouse,
         xlim=c(0,90),
         ylim=c(0,50),
         xlab="Crest length (mm)",
         ylab="Wing chord (right) (mm)")
abline(regression.bw, v=0)
text(x=10,
     y=regression.bw$coef[1]+5,
     labels=c("Intercept"))
#Add some text nearby the intercept to point it out.

#Getting data from your regression model.
coef(regression.bw) #only extracts the coefficients B0 and B1 (intercept and slope).
fitted.values(regression.bw) #extract the fitted values.

#Reminder of how to get diagnostic plots:
plot(regression.bw)
#And here's how to get out residuals if you want to analyze them beyond the default diagnostic plots"
residuals(regression.bw) #Raw residuals
rstudent(regression.bw) #Studentized residuals
#Analyze your diagnostics following the lab from 27 Jan. 2015.

#Plotting and confidence intervals
#Get confidence intervals for parameters.
confint(regression.bw, level=0.95) #for parameter confidence intervals, change % with level.

#Estimate confidence and prediction values for a series of evenly spaced new x values
nd<- data.frame("wingchordr"=seq(min(titmouse.na.omit$wingchordr),
         max(titmouse.na.omit$wingchordr),               
#Range min and max of your values using seq()
         length.out=length(titmouse.na.omit$wingchordr))) #same length as original data
conf <- predict(regression.bw,interval="confidence",newdata=nd) #Use the predict function to get confidence intervals
pred <- predict(regression.bw,interval="prediction",newdata=nd) #Use the predict function to get prediction intervals
yhat <- predict(regression.bw,newdata=nd) #Use the predict function to get predicted values

#Plotting everything together:
plot(crestlength~wingchordr,
     xlab="Wing chord (right) (mm)",
     ylab="Crest length (mm)",
     data=titmouse.na.omit)
## data
abline(regression.bw) ## fit
lines(nd$wingchordr,pred[,c("lwr")],col="red",lty=2)  #Plot the prediction intervals, which predicts where new data may be sampled.
lines(nd$wingchordr,pred[,c("upr")],col="red",lty=2) 
lines(nd$wingchordr,conf[,c("lwr")],col="red",lty=1) 
#Plot the confidence intervals, which show where the mean may be.
lines(nd$wingchordr,conf[,c("upr")],col="red",lty=1) 
lines(nd$wingchordr,yhat, col="blue", lwd=2)
#The fit line only for your range of data.

#Or add a shaded fill if you prefer (as shown last week in lecture)
polygon(x=c(nd$wingchordr, rev(nd$wingchordr)),
        y=c(conf[,c("lwr")],rev(conf[,c("upr")])),
        col=adjustcolor("gray",alpha.f=0.2),
        border=NA)

#Code from http://stackoverflow.com/questions/12544090/predict-lm-in-r-how-to-get-confidence-intervals-right
#Differences between confidence intervals (around mean) and prediction intervals (where next data point is likely from):
#http://blog.minitab.com/blog/adventures-in-statistics/when-should-i-use-confidence-intervals-prediction-intervals-and-tolerance-intervals
#http://www.graphpad.com/support/faqid/1506/


#####################
#Multiple regression#
#####################

regression.cwh<-lm(crestlength~wingchordr+indexsum, #simply add the additional variable.
                   data=titmouse)
summary(regression.cwh,
        correlation=TRUE)
#Show the correlation of the coefficients.
#Each variable means the effect of that variable holding the remaining variables constant.
plot(regression.cwh) #Get diagnostics again!

#Interactions in multiple regression
#Each non-interacting parameter now means the effect of that variable at X=0 (only at the intercept).
regression.cwhi1<-lm(crestlength~wingchordr*indexsum,
                    data=titmouse)
summary(regression.cwhi1,
        correlation=TRUE)

#A different notation with identical results.
regression.cwhi2<-lm(crestlength~wingchordr+indexsum+wingchordr:indexsum,
                     data=titmouse)
summary(regression.cwhi2,
        correlation=TRUE)
#correlation=TRUE returns the correlation matrix of parameters.
plot(regression.cwhi2) #Check the diagnostic plots.

#To get a simple 3D plot, call the 'lattice' package.
library(lattice)
#Save the defaults so it doesn't mess up your plot settings later.
pardefault <- par(no.readonly = T)

cloud(crestlength~wingchordr*indexsum,
      data=titmouse.na.omit)

#It's kinda hard to tell where the points are though.
library(scatterplot3d)
#Get a plot with points and lines connecting to base grid.
s3d <-scatterplot3d(x=titmouse.na.omit$wingchordr,
                    y=titmouse.na.omit$indexsum,
                    z=titmouse.na.omit$crestlength,
                    xlab="Wing chord (right) (mm)",
                    ylab="Hybrid index",
                    zlab="Crest length (mm)",
                    pch=16,
                    highlight.3d=TRUE,
                    type="h",
                    main="3D Scatterplot")
s3d$plane3d(regression.cwh)
#Fits a linear plane over the plot, does not use the interaction.

#There are lots of ways to fit 3d surfaces.
#Additional plotting methods can be found here:
#http://www.statmethods.net/graphs/scatterplot.html

#Plot interactions with lines.
par(pardefault) #scatterplot 3D messes up something so we call pardefault back.

plot(crestlength~wingchordr,
     data=titmouse.na.omit[titmouse.na.omit$indexsum==0,],
     xlab="Wing chord (right) (mm)",
     ylab="Crest length (mm)",
     xlim=c(min(titmouse.na.omit$wingchordr),
#These next four lines ensure no points get left off later.
           max(titmouse.na.omit$wingchordr)),
     ylim=c(min(titmouse.na.omit$crestlength),
           max(titmouse.na.omit$crestlength)))

#Learn how to loop.
for (i in 1:6 ) {                           #i is what goes in the loop.  The series of numbers are your values.
  points(crestlength~wingchordr,            #Put your functions/commands inside the brackets.
         data=titmouse.na.omit[titmouse.na.omit$indexsum==i,],  #Note that i goes in place of the number (0 in plot)
         pch=21,
         bg=i+1) 
#Another i here.
}
#The colors are funky though.
palette() #Check the color list.
default.colors<-palette() #Save the default palette.

(gray<-gray.colors(7, start=1, end=0)) #Create a gray scale.
#creates standard HYBRID INDEX Gray palette that has white (1) through black (7)
#for colors.  Can be used to label figures with indexsum+1 (bg/col won't read 0).

gray.palette<-palette(gray)          #Set the palette to gray and save it as an option like you did for default.
palette() #shows palette color names.

#Run the loop again and see how much easier it is to read.
for (i in 1:6 ) {                           #i is what goes in the loop.  The series of numbers are your values.
  points(crestlength~wingchordr,            #Put your functions/commands inside the brackets.
         data=titmouse.na.omit[titmouse.na.omit$indexsum==i,],  #Note that i goes in place of the number (0 in plot)
         pch=21,
         bg=i+1) 
#You use +1 because 0-6 is the hybrid index but 1-7 matches the gray palette we made.
}

#reset back to default if desired.
palette(default.colors)
palette()

#Add a legend.
legend(title="Hybrid index values",
       title.adj=0,
       x=69.2, y=17.5,
#Where the upper right corner of the legend is located on the coordinates of the plot.
       ncol=7,         #Number of columns.  ncol=7 makes the legend horizontal, essentially.
       xjust=0, bty="n", #How the legend is justified (left, right, center).  Check ?legend for details.  bty= box type.  "n"= none.
       x.intersp=0.75, y.intersp=1,  #Character and line spacing.
       text.width=0.5, #Width of legend text.
       legend = c(0,1,2,3,4,5,6), #Values in the legend.  Can be text if in quotes.
       pt.lwd=2, #Line width around each point.
       pt.bg=gray, #Background color for each point, here taken from the gray object (a set of color values).
       pch=c(21,21,21,21,21,21), #Each of the seven points should be type 21 (which has a background (bg) fill)
       cex=0.8, pt.cex=1.25) #Adjust the size of the text and points relative to the rest of the plot.  Cex means character expansion factor.
#Adjust legend location as needed for your screen size or plot area.

########Adding lines for the interactions
#To plot the lines, for each value of the other variable, we need to use predict.lm()
#for each value of indexsum (0-6).
#We'll make a loop that assigns new predictive dataframes to items 1-7 in a list
#(equivalent to index 0-6, 7 items, but the list can't have a 0th item).
#Then it takes those values (if you look in the list the range of wingchordr
#is always the same, indexsum=the current value of interest).
#Then we use predict.lm to give us predicted values at each desired value of indexsum.
#Create an empty list to store values.
ndm<-list()
#Loop through the fit code.
for (i in 1:7){
 
  ndm[[i]]<- data.frame("wingchordr"=seq(min(titmouse.na.omit$wingchordr),
                                         max(titmouse.na.omit$wingchordr),
                                         length.out=length(titmouse.na.omit$wingchordr)),
                        "indexsum"=i-1)
#indexsum values go 0-6 but the list can't have a 0th item, only 1st-7th.
  lines(predict.lm(regression.cwhi1, newdata=ndm[[i]]), col="black") 
}

#Add some labels showing that the top regression line is for indexsum=6
#and the bottom line is indexsum=0

text("Black-crested (6)", x=85, y=21.5, cex=0.5)
text("Tufted (0)", x=85, y=18, cex=0.5)

#You can see in the plot that there are no interactions, which matches with
#the summary statistics... in this model, there was no significant effect of wingchordr
#or indexsum on crestlength, nor was the interaction significant.

#You can predict individual points as well.
#In class we used an Excel spreadsheet to manually calculate the Y (crestlength) values.
#You can create one yourself with the intercept and slope values and interactions to see get the same values.
#The following R code shows that manual calculations match up with what R is doing.

predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=70,"indexsum"=0))
predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=70,"indexsum"=6))
predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=80,"indexsum"=0))
predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=80,"indexsum"=6))
predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=90,"indexsum"=0))
predict.lm(regression.cwhi1, newdata=data.frame("wingchordr"=90,"indexsum"=6))
#Yup, the values match (ignoring rounding errors)!

#Lots of additional ways (using base R and more packages) to look at your multiple regression:
#http://colbyimaging.duckdns.org:8080/wiki/statistics/multiple-regression

#########################################
#Other variations on regression are major axis (MA) and
#reduced major axis (RMA) regressions, and loess smoothing.
#Loess smoothing
line.loess<-loess(titmouse.na.omit$billlength~titmouse.na.omit$wingchordr,
                  span=0.75, #degree of smoothing
                  family="gaussian", #fitting by least squares
                  degree=2) #polynomial degree, usually 1 or 2.

#Plot the points and smoothing line.
order.loess<-order(titmouse.na.omit$wingchordr) #orders lines so curve is smooth
plot(billlength~wingchordr,
     data=titmouse.na.omit,
     xlab="Wing chord (right) (mm)",
     ylab="Bill length (mm)")
lines(titmouse.na.omit$wingchordr[order.loess],line.loess$fitted[order.loess]) #Plot the loess smooth on your scatterplot.
#For more information on setting parameters and appropriate use of loess curves, see
#CLEVELAND, W. S. 1979. Robust locally weighted regression and smoothing scatterplots. J. Am. Stat. Assoc., 74:829-836.

#For other types of regression, check out the 'lmodel2' package and set regression type via "method=" argument.
#Be careful of the names of the regressions as they vary between books;
# look at the details to make sure you get the
#type of regression you want.


#Finally, you can use ANOVA for linear regression (with all continuous variables).
anova(regression.cwh)
#Since we use type I here, each term adds in sequentially.
#The model improves significantly when we add in hybrid index.
#This makes sense when we get differences by species in our regular ANOVA next week!

No comments:

Post a Comment

Comments and suggestions welcome.