Monday, February 23, 2015

Beginning R: exploratory data analysis (includes basic plotting and testing for normality)

This continues my posts from co-teaching quantitative methods at the University of Manitoba.
#Start by setting your workspace and bringing in your data,
#and checking to make sure the variables are imported correctly
#with numbers as numeric, categories as factors, etc.
setwd("C:\\Username\\Dropbox\\")
#Your directory here.
#Or use the menu...
#Session>Set Working Directory>To Source File Location [or you can choose the directory]

#Import your data.
titmouse<- read.table(file="blog_banding_data.csv", header=TRUE, sep=",")
str(titmouse)
#shows the data types; always do this check.


#############################
##Exploratory data analysis##
#############################
#First, eyeball for weird values
View(titmouse) #You can also click on the dataframe in the environment panel.
#Check to see that all values filled correctly.
#Missing data: NA vs. NaN vs. Inf.
?NA #Missing data
?NaN #'Not a number', more complex.  Help file discusses +/- Inf as well.
#Inf usually encountered in calculations (such as for log transformations).

hist(titmouse) #Can't just use this plotting function on a data frame.
#Use titmouse$[columnyouwant] to plot an individual variable.
hist(titmouse$crestlength) #This gives the default axes and title.
hist(titmouse$crestlength,
     xlab="Crest length (mm)",
#label the x-axis with units
     main="") #Empty the main title
hist(titmouse[titmouse$indexsum==0,"crestlength"], #select rows where indexsum=0 and column is crestlength
     xlab="Crest length (mm)",
     main="Tufted Titmouse")

par(mfrow=c(1,3))
#par does lots of things to the plotting region.
#Here we tell it to do one row and three columns.
hist(titmouse[titmouse$indexsum==0,"crestlength"], #select rows where indexsum=0 and column is crestlength
     xlab="Crest length (mm)",
     main="Tufted Titmouse")
hist(titmouse[titmouse$indexsum>0&titmouse$indexsum<6,"crestlength"],
#select rows where indexsum=0 and column is crestlength
     xlab="Crest length (mm)",
     main="Hybrid titmouse")
hist(titmouse[titmouse$indexsum==6,"crestlength"],
#select rows where indexsum=0 and column is crestlength
     xlab="Crest length (mm)",
     main="Black-crested Titmouse")
par(mfrow=c(1,1))
#set it back to a 1 row, 1 column plot.

#Boxplots
boxplot(titmouse$crestlength) #no axes!  This is terrible!
boxplot(titmouse$crestlength,
        ylab="Crest length (mm)",
        xlab="All the titmice")
par(mfrow=c(2,1))
#two rows, one column for the plots.
boxplot(titmouse[titmouse$indexsum==0,"crestlength"])
boxplot(titmouse[titmouse$indexsum==6,"crestlength"])

#looks funky.
par(mfrow=c(1,2))
boxplot(titmouse[titmouse$indexsum==0,"crestlength"],
        xlab="Tufted Titmouse",
        ylab="Crest length (mm)")
boxplot(titmouse[titmouse$indexsum==6,"crestlength"],
        xlab="Black-crested Titmouse",
        ylab="Crest length (mm)",
        lty="solid")

#Much nicer!
par(mfrow=c(1,1)) #Reset par back to one column, one row plotting region.
#Let's do some categories like last time.
titmouse$species<-titmouse$indexsum # create a new column with indexsum data copied.
#Always copy your column or dataframe when doing something big like this.
#While it won't mess up your original file (this is only in your R session),
#it will save you trouble from having to reimport your file if you mess things up.
titmouse$species<-sub(0,"Tufted",titmouse$species)
titmouse$species<-sub(6,"Black-crested",titmouse$species)
titmouse[titmouse$species>0&titmouse$species<6,]$species = "hybrid"


plot(y=titmouse$crestlength, x=titmouse$species)
#Errors!  Why?
#It doesn't like x as "chr" or character.  You have to have it as a factor.
plot(y=titmouse$crestlength, x=titmouse$state) #Test it with a factor.
titmouse$species<-as.factor(titmouse$species) #Convert desired variable to a factor.
plot(y=titmouse$crestlength, x=titmouse$species) #It works, and plot also makes boxplots.
plot(titmouse$crestlength~titmouse$species) #This does the same thing and introduces R's formula notation, which you will need for many statistical functions.

#Scatterplots.  Plot is very flexible and also does scatterplots if your two variables are continuous.
plot(x=titmouse$taillength, y=titmouse$crestlength)
plot(titmouse$crestlength~titmouse$taillength)
#same as previous, formula notation.
plot(crestlength~taillength, data=titmouse) #same as previous, but referencing the dataframe in the data argument.
#(Each bit between the commas is an "argument" in the function.  So this is a plot function,
#with a formula argument, and a data argument.  We can add the axis label arguments too.

#So, make it have nice axis labels.
plot(crestlength~taillength,
     data=titmouse,
     xlab="Tail length (mm)",
     ylab="Crest length (mm)")


#Get subsets of data.
plot(crestlength~taillength,
     data=titmouse[titmouse$species=="Tufted",],
     xlab="Tail length (mm)",
     ylab="Crest length (mm)")

#You can combine or elaborate these plots, too, by adding points.
points(crestlength~taillength,
     data=titmouse[titmouse$species=="Black-crested",],
     xlab="Tail length (mm)",
     ylab="Crest length (mm)",

     pch=21, #choose the type of point.  ?pch shows all the possible values.
     bg="black") #choose the fill color.

#Default R colors are shown in a pdf here.
colours()
colors()
#These will list off the names (also shown in the pdf).
#There are also ways to set exact colors.

points(crestlength~taillength,
       data=titmouse[titmouse$species=="hybrid",],
       xlab="Tail length (mm)",
       ylab="Crest length (mm)",
       pch=2, #Another example of a symbol
       col=rgb(red=200,green=30,blue=30,
               maxColorValue=255))
#example of using rgb() function to generate color.
?par #"Color Specification" (near the end of the file) discusses other ways change color,
#including default and custom palettes.

#Draw some arbitrary lines in an empty plot.
plot(x=0,
     y=0,
     type="n",
#no symbols drawn
     pch=NULL,
     xlim=c(0,1),
     ylim=c(0,1))

#These are arbitrary values to demonstrate line drawing capabilities.
abline(h=0.5, v=0.25) #give horizontal line or vertical line, can do these separately as well.
abline(a=0, b=0.25,
       lty="dashed")

#give slope and intercept, in the formula for a straight line: y=bx+a.
#Then give the line type (lty=).  ?par lists the possible types.

#How to combine or elaborate these plots, say to get the boxplot with the histogram in the textbook.
#This code is adapted from here.

#Here we use par() to adjust the plot again, but remember to save the defaults first.
pardefault <- par(no.readonly = T)
#From this link (which contains other variations as well)
par(fig=c(0,0.8,0,0.8)) #Here par is used to adjust the plot area.
hist(titmouse$crestlength,
     xlab="Crest length (mm)",
     main="")
#an empty main title
par(fig=c(0,0.8,0.55,1), new=TRUE)
boxplot(titmouse$crestlength,
        horizontal=TRUE,
        axes=FALSE)


#Add to a scatterplot
par(fig=c(0,0.8,0,0.8))
plot(crestlength~taillength,
     data=titmouse,
     xlab="Tail length (mm)",
     ylab="Crest length (mm)",
     pch=21,
     bg="gray",
     col="black")
par(fig=c(0.65,1,0,0.8),new=TRUE)
boxplot(titmouse$crestlength, axes=FALSE)
par(fig=c(0,0.8,0.55,1), new=TRUE)
boxplot(titmouse$taillength,
        horizontal=TRUE,
        axes=FALSE)
par(pardefault)
#reset the default plotting parameters.

#############################
####Graphical diagnostics####
#############################
#Residuals, influence, outliers, and Q-Q plots.

#To get some residuals we'll run a simple linear regression model.
bill.wing.reg<-lm(titmouse$billlength~titmouse$wingchordr)
#Note the use of the formula here again... y~x.
summary(bill.wing.reg) #Get some basic data from it. 
plot(bill.wing.reg)
#look at residuals vs. predicted ("Fitted") values for homoscedascity.
#The first plot puts row numbers or label numbers on outliers.
#The second plot is the Q-Q plot.
#The third is standardized residuals (not discussed in lecture).
#The fourth plots Cook's distance.  Dashed red lines show the boundary of 0.5 Cook's D.

#Get residuals to do additional plots.
rawres<-residuals(bill.wing.reg) #raw residuals
stres<-rstudent(bill.wing.reg) #studentized residuals

#Plot residual vs. independent variable.
plot(rawres~na.omit(titmouse$wingchordr))
#we use na.omit because lm automatically removed NAs.

#Plot residuals in a histogram.
hist(rawres)

#Plot the data with the regression line.
plot(titmouse$billlength~titmouse$wingchordr)
abline(bill.wing.reg)

#Cook's distance
#Help says, "Cases which are influential with
#respect to any of these measures are marked with an asterisk."
im<-influence.measures(bill.wing.reg) #Cook's D is cook.d column.
str(im) #new format... a list.
im[1] #lists are accessed via the level of the list.
im[2] #is it an outlier by the measures in the table?
cookd<-im[[1]][,"cook.d"] #get column "cook.d" from the first item in the list.
plot(cookd, type="h",
     main="Observations",
     ylab="Cook's D",
     xlab="Index (=row number)")
abline(h=4/length(cookd), col="gray")

#This gets us a plot of Cook's D similar to the one SAS generates (shown in Nicky's lecture).

#Q-Q plot for just a raw variable.
qqnorm(titmouse$billlength)
qqline(titmouse$billlength)

#If you want to transform data,
#here's how (though none of our example data need it).
#Create a new column with an appropriate name.
titmouse$lncrestlength<-log(titmouse$crestlength)
#BEWARE!  R uses log() for the natural logarithm.  You must use log10() if you want base 10 logarithms.

#If you have a variable with a zero, you need to add a small constant first.
(titmouse$lnindexsum<-log(titmouse$indexsum+0.001))
#If you don't, you'll get Inf for any zero values.
(log(titmouse$indexsum))
#We'll go into more transformations in a later lab.

#############################
####Tests for normality, ####
####skewness, and kurtosis###
#############################
#Time to install a package.
#Click on "Packages" tab and then "Install".
#Install package "moments" from the CRAN repository (will only work with internet connection).
library(moments) #invoke the library

skewness(titmouse$crestlength, na.rm=TRUE)
kurtosis(titmouse$crestlength, na.rm=TRUE)
shapiro.test(titmouse$crestlength)

library(e1071)

#An error message about masking similarly named functions appears.
#This tells us we know that we are using e1071's functions now.
kurtosis(titmouse$crestlength,
         na.rm=TRUE,
         type=2)
skewness(titmouse$crestlength,
         na.rm=TRUE,
         type=2)

#This one provides more options, gives the algorithms in help,
#and can be compared with SAS.  Let's use this library and remove the 'moments' library.
detach("package:moments")


#Now make some plots
hist(titmouse$crestlength, prob=TRUE,
     xlab="Crest length for titmice",
     ylim=c(0,0.3))
lines(density(na.omit(titmouse$crestlength)))

#Add the normal distribution for your mean/sd.
curve(dnorm(x,
            mean=mean(titmouse$crestlength, na.rm=TRUE),
            sd=sd(titmouse$crestlength, na.rm=TRUE)),
      lty="dashed",
      col="red",
      add=TRUE)



#For the residuals
skewness(stres, type=2) #Let's use the studentized residuals for fun.
kurtosis(stres, type=2) #make sure type=2 if you want to compare to SAS.
shapiro.test(stres)
hist(stres, prob=TRUE,
     breaks=10,
#change the number of histogram bars if you want, see how it changes the graph
     main="Studentized residuals",
     xlab="Wing vs. bill residuals",
     ylim=c(0,0.5))
#This lengthens the y-axis so our density lines won't get cut off in next step.
lines(density(stres)) #Adds the kernel density function of your data.

#Add the normal distribution for your mean/sd.
curve(dnorm(x,
         mean=mean(stres),
         sd=sd(stres)),
      lty="dashed",
      col="red",
      add=TRUE)


##############################
###Testing for collinearity###
######(i.e. correlation)######
##############################

#How to test for correlation in the data.
cor(x=titmouse$crestlength,
    y=titmouse$indexsum,
    use="pairwise.complete.obs",
#another different way of getting rid of NAs
    method="pearson")
cor.test(x=titmouse$crestlength,
         y=titmouse$indexsum,
         na.action=na.rm,
         method="pearson")
#other methods are available, see the help file.

#Correlate more than one at a time, though you only get the correlation coefficient.
cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],
    use="pairwise.complete.obs")



##############################
######Saving your plots#######
##############################

#Can use RStudio's export feature.
#Easier for reproducibility to just directly save into preferred format.
#Set width, height, etc. FOR ANY PLOT THAT YOU MAY HAVE TO RE-DO!
#Examples: publication, dissertation, thesis, reports, anything where having
#similarly formatted figures are important
#(so later fixes don't show up as different).

#I used the instructions from a previous blog post, so I won't repeat that here.

No comments:

Post a Comment

Comments and suggestions welcome.