## Tuesday, April 28, 2015

### Beginning R: selected multivariate statistics, part 3 of 3 (PCA)

This is the last multivariate code post from the quantitative methods class.  Next week will be the final set of code (on non-normal data) from that class, which I co-taught with Dr. Nicola Koper.

Get the data import code and data here.

######################################
####Principle Componenents Analysis###
######################################

#R has two methods for performing PCA.
#Generally they will give similar results and the differences involve the computation and a few summary functions.

pca.morphology<-prcomp(titmouse.na.omit[,1:7], #specifying columns by number can be dangerous if you change your data format.
scale=TRUE, #correlation matrix used instead of covariance matrix, which is only appropriate if everything in same units.
retx=TRUE) #required to get PC scores for each individual.

summary(pca.morphology)
(pca.eigenvalues<-pca.morphology\$sdev^2) #Get eigenvalues.
screeplot(pca.morphology) #Plot eigenvalues.
biplot(pca.morphology)

pca.morphology\$rotation #eigenvectors.  Again the signs are arbitrary so don't worry
#if they differ but absolute values are the same between different programs or versions of R.
#Pearson's correlation of components with original variables.  Easier to interpret.
#Eigenvectors are how you get PCs, so also a sort of weight, just harder to think about.
#If not obvious, probably not interpretable. Interpret loadings above |0.33|.
#PC1 has shorter wing and tail, smaller mass, shorter and shallower bills (size) and longer crest.
#PC1 explains 32.4% of variance.
#PC2 is longer wing and tail, shorter and shallower bills (shape) and explains 20.5% of variance.

#plotting PCA
defaultpalette<-palette() #save the default palette before we mess with it.
gray<-gray.colors(7, start=1, end=0)
palette(gray)

#creates standard hybrid index gray palette that has white (1) through black (7)
#for colors.  Can be used to label figures with indexsum+1.
palette() #shows palette hex codes

pscores<-data.frame(pca.morphology\$x)
#puts PCA scores in a data frame
pscores\$indexsum<-titmouse.na.omit\$indexsum # adds indexsum values to PC scores for individual birds
pscores\$indexcolor<-as.factor(pscores\$indexsum+1)
pscores\$zone<-as.factor(titmouse.na.omit\$zone)
# adds zone age
pscores\$zonenum<-as.factor(titmouse.na.omit\$zone) # makes zone age number column
pscores\$zonenum<- sub("old",21,pscores\$zonenum)
pscores\$zonenum<- sub("young",23,pscores\$zonenum)
pscores\$zonenum<-as.numeric(pscores\$zonenum)

str(pscores)

#plotting principal components
plot(x=pscores\$PC1,y=pscores\$PC2,
asp=1,
mar=c(2,2,2,2)+0.1,
bg=pscores\$indexcolor,
col=7,
pch=pscores\$zonenum,
cex=2,
lwd=2,
xlab="PC1 Increasing values: shorter wing and tail, shorter and shallower bill, less weight",
ylab="PC2 Increasing values: longer wing and tail, shorter and shallower bill",
cex.axis=1,
cex.lab=0.6)

legend(title="Hybrid index values",
x=-5.5,
y=3.76,
ncol=1,
xjust=0,
bty="n",
x.intersp=0.75,
y.intersp=1,
text.width=0.5,
legend = c(0,1,2,3,4,5,6),
pt.lwd=2,
pt.bg=gray,
pch=c(21,21,21,21,21,21),
cex=0.8,
pt.cex=1.25)

# specifies colors, labels for legend, and symbols, and location for legend and size (cex)
legend(title="Zone age",
x=-5.5,
y=0,
ncol=1,
bty="n",
x.intersp=1,
y.intersp=1,
text.width=0.5,
legend=c("Old","Young"),
pt.bg="gray",
pch=c(21,23),
cex=0.8,
pt.cex=1.25,
pt.lwd=2)

### add ellipses for old/young
#make some logical objects to keep your selections cleaner in the code below.
tuftedmo<-pscores\$indexsum<1 & pscores\$zone=='old'
bcmo<-pscores\$indexsum>5 & pscores\$zone=='old'
tuftedmy<-pscores\$indexsum<1 & pscores\$zone=='young'
bcmy<-pscores\$indexsum>5 & pscores\$zone=='young'

library(car)
#required for dataEllipse although we've called it earlier so it's unnecessary here.
dataEllipse(pscores[tuftedmo, ]\$PC1,
pscores[tuftedmo, ]\$PC2,
levels=0.68,
lty=1,
plot.points=FALSE,
center.pch=FALSE)
dataEllipse(pscores[bcmo, ]\$PC1,
pscores[bcmo, ]\$PC2,
levels=0.68,lty=1,
plot.points=FALSE,
col="black",
center.pch=FALSE)

dataEllipse(pscores[tuftedmy, ]\$PC1,
pscores[tuftedmy, ]\$PC2,
levels=0.68,
lty=3,
plot.points=FALSE,
col="gray",
center.pch=FALSE)
dataEllipse(pscores[bcmy, ]\$PC1,
pscores[bcmy, ]\$PC2,
levels=0.68,lty=3,
plot.points=FALSE,
col="black",
center.pch=FALSE)

#These give black lines for Black-crested Titmouse and gray lines for Tufted Titmouse.
#Birds from the young zone have diamond symbols and dotted lines,
#while birds from the older zone have solid lines and round symbols.

#Return palette() to default after using the gray palette for this plot.
palette(defaultpalette)
palette()