Tuesday, May 05, 2015

Beginning R: a very brief survey of methods for non-normal data

This was the last lab code for the quantitative methods class I co-taught with Dr. Nicola Koper this winter semester (i.e., the semester I used to know as "spring").

#Start by setting your workspace.  Even though we don't import a file this time,
#it's good practice so any figures you generate will go to the correct folder.
#Otherwise they tend to disappear into that totally obvious folder that you then forget.
######################################################################################
setwd("C:\\Users\\YourUsername\\")
#Your directory here.
#Or use the menu...
#Session>Set Working Directory>To Source File Location [or you can choose the directory]

#Today we'll use a sample dataset that's already in R,
#plus generate some random data of our own.
#No need for text files or importing!

#Generate some proportions
#No proportion data, so we'll just add it.
#I generate more data than we need, then cut off <0 and >1.
inventedproportions.prelim<-rnorm(mean=0.5,
                                  sd=0.5,
                                  n=300)
inventedproportions.keep.logical<-inventedproportions.prelim<=1&inventedproportions.prelim>=0
inventedproportions<-inventedproportions.prelim[inventedproportions.keep.logical]
airquality$inventedproportions<-inventedproportions[1:153]

#Generate zero-inflated data (download this package if necessary; I had never used it before).
library(gamlss.dist)

airquality$somanyzeros<-rZIP(n=153,
     mu=5,
     sigma=0.3)


str(airquality)
airquality<-na.omit(airquality)
str(airquality)

###########################################
#Brief review of graphical data exploration
###########################################

par(mfrow=c(2,3))
#Histograms for normality.
hist(airquality$Ozone)
hist(airquality$Solar.R)
hist(airquality$Wind)
hist(airquality$Temp)
hist(airquality$inventedproportions)
hist(airquality$somanyzeros)

#Residuals are what count for our models.
model.Solar.R<-lm(Ozone~Solar.R,
          data=airquality)
par(mfrow=c(2,2))
plot(model.Solar.R)
resid.model.Solar.R<-resid(model.Solar.R)
par(mfrow=c(1,1))
hist(resid.model.Solar.R)

model.somanyzeros<-lm(Ozone~somanyzeros,
                  data=airquality)
par(mfrow=c(2,2))
plot(model.somanyzeros)
resid.model.somanyzeros<-resid(model.somanyzeros)
hist(resid.model.somanyzeros)


###########################################
#Transforming your data
###########################################

airquality$ln.Solar.R<-log(airquality$Solar.R)
#add a constant if there are zeros. 
#pick a constant that transforms your variable best,
#see pp. 65-66 in Quinn and Keough for explanation.
airquality$ln.Solar.R.min<-log(airquality$Solar.R+min(airquality$Solar.R, na.rm=TRUE))
airquality$ln.Solar.R.001<-log(airquality$Solar.R+0.001)
airquality$ln.Solar.R.1<-log(airquality$Solar.R+1)
#Some other transformations.
airquality$Solar.R.inverted<-1/(airquality$Solar.R)
airquality$sqrt.Solar.R<-sqrt(airquality$Solar.R) #square root
airquality$cube.root.Solar.R<-airquality$Solar.R^(1/3) #cube root
airquality$fourth.root.Solar.R<-airquality$Solar.R^(1/4) #fourth root


#Box-Cox transformation.  R only lets you actually run this on a model.
#It's for transforming your response variable ONLY.
library(MASS)
par(mfrow=c(1,2))
Solar.R.boxcox<-boxcox(model.Solar.R)

#Use the graph to zoom in around the mostly likely lambda value.
Solar.R.boxcox.zoom<-boxcox(model.Solar.R,
                            lambda=seq(-0.05,0.35,0.1))
#From the plot, lambda should be about 0.15 (1/6 or 1/7, so a 6th or 7th root transformation.)
airquality$sixth.root.Ozone<-airquality$Ozone^(1/6)

airquality$arcsinsqrt.inventedproportions<-asin(sqrt(airquality$inventedproportions)) #arcsin square root transformation for proportions

#reflecting your data before transforming (pg. 66 in Quinn and Keough)
reflection.constant<-(max(airquality$somanyzeros)+1)
airquality$lnreflected.somanyzeros<-log((reflection.constant-airquality$somanyzeros)+0.001)
#Re-evaluating your data after transformation: run the exploratory analyses again.

par(mfrow=c(1,2))
hist(airquality$Ozone)
hist(airquality$sixth.root.Ozone)

par(mfrow=c(2,4))
hist(airquality$Solar.R)
hist(airquality$ln.Solar.R.min)
hist(airquality$ln.Solar.R.001)
hist(airquality$ln.Solar.R.1)
#Some other transformations.
hist(airquality$Solar.R.inverted)
hist(airquality$sqrt.Solar.R)
hist(airquality$cube.root.Solar.R)
hist(airquality$fourth.root.Solar.R)

par(mfrow=c(1,2))
hist(airquality$inventedproportions)
hist(airquality$arcsinsqrt.inventedproportions)

hist(airquality$somanyzeros)
hist(airquality$lnreflected.somanyzeros)

#Residuals are what count for our models.
model.sqrt.Solar.R<-lm(Ozone~sqrt.Solar.R,
                  data=airquality)
par(mfrow=c(2,2))
plot(model.sqrt.Solar.R)
resid.model.sqrt.Solar.R<-resid(model.sqrt.Solar.R)
par(mfrow=c(1,2))
hist(resid.model.sqrt.Solar.R)
#previous
hist(resid.model.Solar.R)

model.somanyzeros<-lm(Ozone~somanyzeros,
                      data=airquality)
par(mfrow=c(2,2))
plot(model.somanyzeros)
resid.model.somanyzeros<-resid(model.somanyzeros)


model.lnreflected.somanyzeros<-lm(Ozone~lnreflected.somanyzeros,
                      data=airquality)
par(mfrow=c(2,2))
plot(model.lnreflected.somanyzeros)
resid.model.lnreflected.somanyzeros<-resid(model.lnreflected.somanyzeros)

#compare
hist(resid.model.lnreflected.somanyzeros)
hist(resid.model.somanyzeros)
#The original is actually better... so remember to look at residuals!

###########################################
#Heteroscedasticity: inequality of variance
###########################################
#code from last lab, but with the new sample data.
pairs(~Ozone+Solar.R+Wind+Temp,
      data=airquality)

#density plots with our scatterplots
library(car)
scatterplotMatrix(~Ozone+Solar.R+Wind+Temp, #weird formula here, nothing as "response"/Y, and |indicates grouping variable.
                  data=airquality,
                  smoother=FALSE, #default is TRUE, draws a nonparametric smoother.
                  reg.line=FALSE, #default is TRUE, draws a regression line.
                  diagonal="density", #puts a density kernel line on the diagonal for each group.
                  # ellipse=TRUE,
                  # levels=c(0.5,0.95), #dataEllipse quartiles
                  pch=c(2,3,1), #rearrange the default points a bit
                  col=c(1,1,1)) #changed the colors to all black.

#tests for homogeneity of variance
#adapted from: http://www.cookbook-r.com/Statistical_analysis/Homogeneity_of_variance/

#These only work on groups, so group your Solar.R first.
airquality$Solar.R.cat<-NA
high<-airquality$Solar.R<=max(airquality$Solar.R, na.rm=TRUE)&airquality$Solar.R>=median(airquality$Solar.R, na.rm=TRUE)
airquality[high,"Solar.R.cat"]="high"

low<-airquality$Solar.R>=min(airquality$Solar.R, na.rm=TRUE)&airquality$Solar.R<median(airquality$Solar.R, na.rm=TRUE)
airquality[low,"Solar.R.cat"]="low"
airquality$Solar.R.cat<-as.factor(airquality$Solar.R.cat)

bartlett.test(Ozone~Solar.R.cat, #variable as a function of groups
              data=airquality )
?bartlett.test #gives options for other tests.

library(car)
leveneTest(Ozone~Solar.R.cat,
           data=airquality)
#Less sensitive than Bartlett's to non-normality.
###########################################
#Variations on regression
###########################################
#nonlinear relationships (adding a polynomial term), adapted from here.
#The I() function is necessary for R to read your quadratic or cubed term correctly.
quadratic.model<-lm(Ozone~Temp+I(Temp^2),
                    data=airquality)
summary(quadratic.model)

#cubic model
cubic.model<-lm(Ozone~Temp+I(Temp^2)+I(Temp^3),
                    data=airquality)
summary(cubic.model)

par(mfrow=c(1,1))
plot(airquality$Ozone~airquality$Temp)
ndQ<- data.frame("Temp"=seq(min(airquality$Temp),
                            max(airquality$Temp),
                            length.out=length(airquality$Temp)))

lines(ndQ$Temp,
      predict(quadratic.model,
              newdata=ndQ),
      lty="solid",
      col="black") 

lines(ndQ$Temp,
      predict(cubic.model,
              newdata=ndQ),
      lty="solid",
      col="red") 

anova(quadratic.model,cubic.model)

#weighted OLS regression
#Additional info.

#Group your data, or use existing groups.  (Here I just use the two groups from before for the airquality data.)

(var.solar<-tapply(airquality$Ozone, #the variable we're interested in
                      airquality$Solar.R.cat, #index (i.e., the grouping variable)
                      FUN=var, na.rm=TRUE))
solar.wts<-1/var.solar
airquality$wts<-NA
airquality[airquality$Solar.R.cat=="high","wts"]=solar.wts[1]
airquality[airquality$Solar.R.cat=="low","wts"]=solar.wts[2]

wt.lm<-lm(Ozone~Solar.R,
          weights=wts,
          data=airquality)
summary(wt.lm)
par(mfrow=c(2,2))
plot(wt.lm)
#compare with original
plot(model.Solar.R)
summary(model.Solar.R)

#Or run as ANOVA.
wt.anova<-lm(Ozone~Solar.R.cat,
             weights=wts,
             data=airquality)
summary(wt.anova)
plot(wt.anova)
#compare with unweighted.
solar.anova<-lm(Ozone~Solar.R.cat,
                data=airquality)
summary(solar.anova)
plot(solar.anova)

#For more details on how to do this (including normalizing your weights, which is apparently needed
#if you plan to get confidence intervals), check the top answer here.

###########################################
#Generalized linear models
###########################################
#functions, how to select family and link
glm.wolf<-glm(number~latitude,
              data=Depredations,
              family=poisson)
#uses default link for family.
summary(glm.wolf)
plot(glm.wolf)

par(mfrow=c(1,1))
plot(Depredations$number~Depredations$latitude)
ndW<- data.frame("latitude"=seq(min(Depredations$latitude),
                               max(Depredations$latitude),
                               length.out=length(Depredations$latitude)))
#plot the prediction with the new data (otherwise it uses rownumber and stretches the line out uselessly).
lines(ndW$latitude,
      predict.glm(glm.wolf,
                      newdata=ndW,
                  type="response"),
      lty="solid",lwd=2)

#GLM with a factor.
glm.snails<-glm(Deaths~Rel.Hum*Species,
                data=snails,
              family=poisson(link="log"))
#select the link manually, although still leaving it at default.
summary(glm.snails)

#quasipoisson suggested because very high deviance to df ratio.
glm.snails.q<-glm(Deaths~Rel.Hum*Species,
                data=snails,
                family=quasipoisson)
summary(glm.snails.q)

#plot the two lines.
plot(snails$Deaths[snails$Species=="A"]~jitter(snails$Rel.Hum[snails$Species=="A"]),
     pch="A",
     cex=0.5,
     xlim=c(60,80),
     ylim=c(0,20))
points(snails$Deaths[snails$Species=="B"]~jitter(snails$Rel.Hum[snails$Species=="B"]),
     pch="B",
     cex=0.5)

ndA<- data.frame("Rel.Hum"=seq(min(snails$Rel.Hum),
                            max(snails$Rel.Hum),
                            length.out=length(snails$Rel.Hum)),
                        "Species"="A")
lines(ndA$Rel.Hum,
      predict.glm(glm.snails.q,
                  newdata=ndA,
                  type="response"), #type="response" is necessary for these interesting links.
      lty="solid") 

ndB<- data.frame("Rel.Hum"=seq(min(snails$Rel.Hum),
                            max(snails$Rel.Hum),
                            length.out=length(snails$Rel.Hum)),
                 "Species"="B")
lines(ndB$Rel.Hum,
      exp(predict.glm(glm.snails.q,
                  newdata=ndB)),
      lty="dotted")
#Just to show that predict.glm is going the same thing as exp(prediction):
lines(ndB$Rel.Hum,
      predict.glm(glm.snails.q,
                  newdata=ndB,
                  type="response"),
      lty="solid",
      col="blue")

(predictions.compared<-data.frame(snails$Species,
                                 snails$Deaths,
                                 fitted(glm.snails.q),
                                 predict(glm.snails.q, type="response"),
                                 predict(glm.snails.q))) #note that without type="response" you get the linear predictors only.

#Run as an ANCOVA (analysis of covariance)
Anova(glm.snails.q, type=3)

#More info on Poisson regression specifically.

#evaluation of distribution fit using deviance (see lecture).
#Look at summary(yourmodel)
summary(glm.snails.q)
#Ratio of residual deviance to degrees of freedom as described in lecture.
#You can also look for packages that expand on this topic.

###########################################
#Nonparametric tests
###########################################
#Mann-Whitney/Wilcoxon tests.
wilcox.test(Ozone~Solar.R.cat,
            data=airquality)
#pairwise.wilcox.test() if you have paired measurements (i.e. repeated measurements), also known as the Signed-Ranks test.

#For two or MORE samples, use the Kruskal-Wallis rank sum test.
kruskal.test(Ozone~Month,
             data=airquality) #for two or more samples.
boxplot(Ozone~Month,
        data=airquality,
        xlab="Month",
        ylab="Ozone in some units")
#Opinions on post-hoc tests vary, but a few packages are available and easily findable via google.

No comments:

Post a Comment

Comments and suggestions welcome.