## Monday, February 23, 2015

### Beginning R: exploratory data analysis (includes basic plotting and testing for normality)

This continues my posts from co-teaching quantitative methods at the University of Manitoba.
#Start by setting your workspace and bringing in your data,
#and checking to make sure the variables are imported correctly
#with numbers as numeric, categories as factors, etc.
#Or use the menu...
#Session>Set Working Directory>To Source File Location [or you can choose the directory]

str(titmouse)
#shows the data types; always do this check.

#############################
##Exploratory data analysis##
#############################
#First, eyeball for weird values
View(titmouse) #You can also click on the dataframe in the environment panel.
#Check to see that all values filled correctly.
#Missing data: NA vs. NaN vs. Inf.
?NA #Missing data
?NaN #'Not a number', more complex.  Help file discusses +/- Inf as well.
#Inf usually encountered in calculations (such as for log transformations).

hist(titmouse) #Can't just use this plotting function on a data frame.
#Use titmouse\$[columnyouwant] to plot an individual variable.
hist(titmouse\$crestlength) #This gives the default axes and title.
hist(titmouse\$crestlength,
xlab="Crest length (mm)",
#label the x-axis with units
main="") #Empty the main title
hist(titmouse[titmouse\$indexsum==0,"crestlength"], #select rows where indexsum=0 and column is crestlength
xlab="Crest length (mm)",
main="Tufted Titmouse")

par(mfrow=c(1,3))
#par does lots of things to the plotting region.
#Here we tell it to do one row and three columns.
hist(titmouse[titmouse\$indexsum==0,"crestlength"], #select rows where indexsum=0 and column is crestlength
xlab="Crest length (mm)",
main="Tufted Titmouse")
hist(titmouse[titmouse\$indexsum>0&titmouse\$indexsum<6,"crestlength"],
#select rows where indexsum=0 and column is crestlength
xlab="Crest length (mm)",
main="Hybrid titmouse")
hist(titmouse[titmouse\$indexsum==6,"crestlength"],
#select rows where indexsum=0 and column is crestlength
xlab="Crest length (mm)",
main="Black-crested Titmouse")
par(mfrow=c(1,1))
#set it back to a 1 row, 1 column plot.

#Boxplots
boxplot(titmouse\$crestlength) #no axes!  This is terrible!
boxplot(titmouse\$crestlength,
ylab="Crest length (mm)",
xlab="All the titmice")
par(mfrow=c(2,1))
#two rows, one column for the plots.
boxplot(titmouse[titmouse\$indexsum==0,"crestlength"])
boxplot(titmouse[titmouse\$indexsum==6,"crestlength"])

#looks funky.
par(mfrow=c(1,2))
boxplot(titmouse[titmouse\$indexsum==0,"crestlength"],
xlab="Tufted Titmouse",
ylab="Crest length (mm)")
boxplot(titmouse[titmouse\$indexsum==6,"crestlength"],
xlab="Black-crested Titmouse",
ylab="Crest length (mm)",
lty="solid")

#Much nicer!
par(mfrow=c(1,1)) #Reset par back to one column, one row plotting region.
#Let's do some categories like last time.
titmouse\$species<-titmouse\$indexsum # create a new column with indexsum data copied.
#Always copy your column or dataframe when doing something big like this.
#While it won't mess up your original file (this is only in your R session),
#it will save you trouble from having to reimport your file if you mess things up.
titmouse\$species<-sub(0,"Tufted",titmouse\$species)
titmouse\$species<-sub(6,"Black-crested",titmouse\$species)
titmouse[titmouse\$species>0&titmouse\$species<6,]\$species = "hybrid"

plot(y=titmouse\$crestlength, x=titmouse\$species)
#Errors!  Why?
#It doesn't like x as "chr" or character.  You have to have it as a factor.
plot(y=titmouse\$crestlength, x=titmouse\$state) #Test it with a factor.
titmouse\$species<-as.factor(titmouse\$species) #Convert desired variable to a factor.
plot(y=titmouse\$crestlength, x=titmouse\$species) #It works, and plot also makes boxplots.
plot(titmouse\$crestlength~titmouse\$species) #This does the same thing and introduces R's formula notation, which you will need for many statistical functions.

#Scatterplots.  Plot is very flexible and also does scatterplots if your two variables are continuous.
plot(x=titmouse\$taillength, y=titmouse\$crestlength)
plot(titmouse\$crestlength~titmouse\$taillength)
#same as previous, formula notation.
plot(crestlength~taillength, data=titmouse) #same as previous, but referencing the dataframe in the data argument.
#(Each bit between the commas is an "argument" in the function.  So this is a plot function,
#with a formula argument, and a data argument.  We can add the axis label arguments too.

#So, make it have nice axis labels.
plot(crestlength~taillength,
data=titmouse,
xlab="Tail length (mm)",
ylab="Crest length (mm)")

#Get subsets of data.
plot(crestlength~taillength,
data=titmouse[titmouse\$species=="Tufted",],
xlab="Tail length (mm)",
ylab="Crest length (mm)")

#You can combine or elaborate these plots, too, by adding points.
points(crestlength~taillength,
data=titmouse[titmouse\$species=="Black-crested",],
xlab="Tail length (mm)",
ylab="Crest length (mm)",

pch=21, #choose the type of point.  ?pch shows all the possible values.
bg="black") #choose the fill color.

#Default R colors are shown in a pdf here.
colours()
colors()
#These will list off the names (also shown in the pdf).
#There are also ways to set exact colors.

points(crestlength~taillength,
data=titmouse[titmouse\$species=="hybrid",],
xlab="Tail length (mm)",
ylab="Crest length (mm)",
pch=2, #Another example of a symbol
col=rgb(red=200,green=30,blue=30,
maxColorValue=255))
#example of using rgb() function to generate color.
?par #"Color Specification" (near the end of the file) discusses other ways change color,
#including default and custom palettes.

#Draw some arbitrary lines in an empty plot.
plot(x=0,
y=0,
type="n",
#no symbols drawn
pch=NULL,
xlim=c(0,1),
ylim=c(0,1))

#These are arbitrary values to demonstrate line drawing capabilities.
abline(h=0.5, v=0.25) #give horizontal line or vertical line, can do these separately as well.
abline(a=0, b=0.25,
lty="dashed")

#give slope and intercept, in the formula for a straight line: y=bx+a.
#Then give the line type (lty=).  ?par lists the possible types.

#How to combine or elaborate these plots, say to get the boxplot with the histogram in the textbook.
#This code is adapted from here.

#Here we use par() to adjust the plot again, but remember to save the defaults first.
pardefault <- par(no.readonly = T)
#From this link (which contains other variations as well)
par(fig=c(0,0.8,0,0.8)) #Here par is used to adjust the plot area.
hist(titmouse\$crestlength,
xlab="Crest length (mm)",
main="")
#an empty main title
par(fig=c(0,0.8,0.55,1), new=TRUE)
boxplot(titmouse\$crestlength,
horizontal=TRUE,
axes=FALSE)

#Add to a scatterplot
par(fig=c(0,0.8,0,0.8))
plot(crestlength~taillength,
data=titmouse,
xlab="Tail length (mm)",
ylab="Crest length (mm)",
pch=21,
bg="gray",
col="black")
par(fig=c(0.65,1,0,0.8),new=TRUE)
boxplot(titmouse\$crestlength, axes=FALSE)
par(fig=c(0,0.8,0.55,1), new=TRUE)
boxplot(titmouse\$taillength,
horizontal=TRUE,
axes=FALSE)
par(pardefault)
#reset the default plotting parameters.

#############################
####Graphical diagnostics####
#############################
#Residuals, influence, outliers, and Q-Q plots.

#To get some residuals we'll run a simple linear regression model.
bill.wing.reg<-lm(titmouse\$billlength~titmouse\$wingchordr)
#Note the use of the formula here again... y~x.
summary(bill.wing.reg) #Get some basic data from it.
plot(bill.wing.reg)
#look at residuals vs. predicted ("Fitted") values for homoscedascity.
#The first plot puts row numbers or label numbers on outliers.
#The second plot is the Q-Q plot.
#The third is standardized residuals (not discussed in lecture).
#The fourth plots Cook's distance.  Dashed red lines show the boundary of 0.5 Cook's D.

#Get residuals to do additional plots.
rawres<-residuals(bill.wing.reg) #raw residuals
stres<-rstudent(bill.wing.reg) #studentized residuals

#Plot residual vs. independent variable.
plot(rawres~na.omit(titmouse\$wingchordr))
#we use na.omit because lm automatically removed NAs.

#Plot residuals in a histogram.
hist(rawres)

#Plot the data with the regression line.
plot(titmouse\$billlength~titmouse\$wingchordr)
abline(bill.wing.reg)

#Cook's distance
#Help says, "Cases which are influential with
#respect to any of these measures are marked with an asterisk."
im<-influence.measures(bill.wing.reg) #Cook's D is cook.d column.
str(im) #new format... a list.
im #lists are accessed via the level of the list.
im #is it an outlier by the measures in the table?
cookd<-im[][,"cook.d"] #get column "cook.d" from the first item in the list.
plot(cookd, type="h",
main="Observations",
ylab="Cook's D",
xlab="Index (=row number)")
abline(h=4/length(cookd), col="gray")

#This gets us a plot of Cook's D similar to the one SAS generates (shown in Nicky's lecture).

#Q-Q plot for just a raw variable.
qqnorm(titmouse\$billlength)
qqline(titmouse\$billlength)

#If you want to transform data,
#here's how (though none of our example data need it).
#Create a new column with an appropriate name.
titmouse\$lncrestlength<-log(titmouse\$crestlength)
#BEWARE!  R uses log() for the natural logarithm.  You must use log10() if you want base 10 logarithms.

#If you have a variable with a zero, you need to add a small constant first.
(titmouse\$lnindexsum<-log(titmouse\$indexsum+0.001))
#If you don't, you'll get Inf for any zero values.
(log(titmouse\$indexsum))
#We'll go into more transformations in a later lab.

#############################
####Tests for normality, ####
####skewness, and kurtosis###
#############################
#Time to install a package.
#Click on "Packages" tab and then "Install".
#Install package "moments" from the CRAN repository (will only work with internet connection).
library(moments) #invoke the library

skewness(titmouse\$crestlength, na.rm=TRUE)
kurtosis(titmouse\$crestlength, na.rm=TRUE)
shapiro.test(titmouse\$crestlength)

library(e1071)

#An error message about masking similarly named functions appears.
#This tells us we know that we are using e1071's functions now.
kurtosis(titmouse\$crestlength,
na.rm=TRUE,
type=2)
skewness(titmouse\$crestlength,
na.rm=TRUE,
type=2)

#This one provides more options, gives the algorithms in help,
#and can be compared with SAS.  Let's use this library and remove the 'moments' library.
detach("package:moments")

#Now make some plots
hist(titmouse\$crestlength, prob=TRUE,
xlab="Crest length for titmice",
ylim=c(0,0.3))
lines(density(na.omit(titmouse\$crestlength)))

#Add the normal distribution for your mean/sd.
curve(dnorm(x,
mean=mean(titmouse\$crestlength, na.rm=TRUE),
sd=sd(titmouse\$crestlength, na.rm=TRUE)),
lty="dashed",
col="red",

#For the residuals
skewness(stres, type=2) #Let's use the studentized residuals for fun.
kurtosis(stres, type=2) #make sure type=2 if you want to compare to SAS.
shapiro.test(stres)
hist(stres, prob=TRUE,
breaks=10,
#change the number of histogram bars if you want, see how it changes the graph
main="Studentized residuals",
xlab="Wing vs. bill residuals",
ylim=c(0,0.5))
#This lengthens the y-axis so our density lines won't get cut off in next step.
lines(density(stres)) #Adds the kernel density function of your data.

#Add the normal distribution for your mean/sd.
curve(dnorm(x,
mean=mean(stres),
sd=sd(stres)),
lty="dashed",
col="red",

##############################
###Testing for collinearity###
######(i.e. correlation)######
##############################

#How to test for correlation in the data.
cor(x=titmouse\$crestlength,
y=titmouse\$indexsum,
use="pairwise.complete.obs",
#another different way of getting rid of NAs
method="pearson")
cor.test(x=titmouse\$crestlength,
y=titmouse\$indexsum,
na.action=na.rm,
method="pearson")
#other methods are available, see the help file.

#Correlate more than one at a time, though you only get the correlation coefficient.
cor(titmouse[,c("wingchordr","taillength","crestlength","billlength")],
use="pairwise.complete.obs")

##############################
##############################

#Can use RStudio's export feature.
#Easier for reproducibility to just directly save into preferred format.
#Set width, height, etc. FOR ANY PLOT THAT YOU MAY HAVE TO RE-DO!
#Examples: publication, dissertation, thesis, reports, anything where having
#similarly formatted figures are important
#(so later fixes don't show up as different).

#I used the instructions from a previous blog post, so I won't repeat that here.