#import data file into R data<-read.table("http://user.mendelu.cz/drapela/Forest_Biometry/Data/plots.txt",sep='\t',header=TRUE) ######################################################################### # EXPLORATORY DATA ANALYSIS # ######################################################################### par(mfrow=c(1,1)) ##HISTOGRAMS # selection of DBH of spruce to the variable "d13" d13<-data[data$species=='SPRUCE',]$d13 # histogram of "d13" - y axis is scaled as a density hist(d13, col="lightblue", main="Histogram DBH - spruce", probability=T,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' '))) rug(d13) # we add lines of experimental density(dashed blue) and density of normal distribution (solid red) xx<-seq(min(d13),max(d13),1) lines(xx,dnorm(xx,mean(d13),sd(d13)),col="red") lines(density(d13),col="blue",lty="dashed") # the same plot with user-defined breaks and diameter class width hist(d13,breaks=seq(110,730,by=20),ylim=c(0,0.008), col="lightblue", main="Histogram DBH - spruce",probability=T,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' '))) xx<-seq(min(d13),max(d13),1) lines(xx,dnorm(xx,mean(d13),sd(d13)),col="red") lines(density(d13),col="blue",lty="dashed") # the same plot but there are frequences instead of densities on y-axis hist(d13,breaks=seq(110,730,by=20), col="lightblue", main="Histogram DBH - spruce",probability=F,ylab="frequency", xlab=substitute(paste(d[13], "[mm]", sep=' '))) xx<-seq(min(d13),max(d13),1) lines(xx,20*length(d13)*dnorm(xx,mean(d13),sd(d13)),col="red") dd13<-density(d13) lines(dd13$x,20*length(d13)*dd13$y,col="blue",lty="dashed") #histograms d13 for all tree species in one plot (with empirical density) b.d13<-data[data$species=='BEECH',]$d13 p.d13<-data[data$species=='PINE',]$d13 o.d13<-data[data$species=='OAK',]$d13 par(mfrow=c(2,2)) hist(d13, col="lightblue", main="Histogram DBH - spruce",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' '))) lines(density(d13), col="red", lty="dashed") hist(b.d13, col="lightblue", main="Histogram DBH - beech",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' '))) lines(density(b.d13), col="red", lty="dashed") hist(o.d13, col="lightblue", main="Histogram DBH - oak",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' '))) lines(density(o.d13), col="red", lty="dashed") hist(p.d13, col="lightblue", main="Histogram DBH - pine",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' '))) lines(density(p.d13), col="red", lty="dashed") par(mfrow=c(1,1)) ##QUANTILE-QUANTILE PLOT (QQ) qqnorm(d13) qqline(d13) library(car) qqPlot(d13) #QQ plots d13 for all tree species in one plot par(mfrow=c(2,2)) qqnorm(d13, main="QQ plot DBH - spruce") qqline(d13, col="red") qqnorm(b.d13, main="QQ plot DBH - beech") qqline(b.d13, col="red") qqnorm(o.d13, main="QQ plot DBH - oak") qqline(o.d13, col="red") qqnorm(p.d13, main="QQ plot DBH - pine") qqline(p.d13, col="red") ## BOX PLOTS # horizontal box plot par(mfrow=c(2,1)) boxplot(d13~species, data= data, notch=TRUE, col="lightgreen", main="Box plot DBH", ylab = "d13 [mm]",horizontal=TRUE) boxplot(h~species, data= data, notch=TRUE, col="lightgreen", main="Box plot height", ylab= "h[m]",horizontal=TRUE) par(mfrow=c(1,1)) # "better" box plots par(mfrow=c(1,2)) boxplot(d13~species, data= data, notch=TRUE, col="lightgreen", main="Box plot DBH", ylab = "d13[mm]",outcol="white") points(jitter(as.numeric(factor(data$species))),data$d13,col="lightblue4",pch=1,cex=0.6) points(c(1,2,3,4),tapply(data$d13,data$species,mean),pch=16,col="red") boxplot(h~species, data= data, notch=TRUE, col="lightgreen", main="Box plot height", ylab= "h[m]",outcol="white") points(jitter(as.numeric(factor(data$species))),data$h,col="lightblue4",pch=1,cex=0.6) points(c(1,2,3,4),tapply(data$h,data$species,mean,na.rm=TRUE),pch=16,col="red") par(mfrow=c(1,1)) ## TESTS OF NORMALITY #shapiro wilk's test of normality for d13 shapiro.test(data[data$species=='SPRUCE',]$d13) shapiro.test(data[data$species=='BEECH',]$d13) shapiro.test(data[data$species=='OAK',]$d13) shapiro.test(data[data$species=='PINE',]$d13) ############################################################# # ESTIMATES OF PARAMETERS # ############################################################# ## SIMPLE R-FUNCTIONS FOR PARAMETER ESTIMATES # mean for d13 by formula mean_d13<-sum(data$d13)/length(data$d13) mean_d13 # mean for d13 by R function mean1_d13<-mean(data$d13) mean1_d13 #mean for h by R function mean(data[data$species=='SPRUCE',]$h) mean(data[data$species=='SPRUCE',]$h, na.rm=TRUE) #mean for h by R function with saving into variable "m" m<-mean(data[data$species=='SPRUCE',]$h, na.rm=TRUE) #mean for all species mean<-tapply(data$h,data$species,mean,na.rm=T) mean #sd and median for all species sd<-tapply(data$h,data$species,sd,na.rm=T) sd median<-tapply(data$h,data$species,median,na.rm=T) median #table of results res<-cbind(mean,median, sd) res #rounding of mean and sd to one decimal point mean<-round(mean,1) sd<-round(sd,1) res1<-round(cbind(mean,median, sd),1) res1 ## SOME COMMANDS THAT CREATE TABLES OF ALL DESCRIPTIVE PARAMETERS IN ONE STEP #http://www.sthda.com/english/wiki/descriptive-statistics-and-graphics #estimation of parameters by means of "psych" package library(psych) describe(d13) describeBy(data$d13,data$species) #estimation of parameters by means of "pastecs" package library(pastecs) data1<-cbind(data$d13, data$h,data$h_live,data$h_dead) tab_res<-stat.desc(data1,norm=T) tab_res<-round(tab_res,2) tab_res # TABLE OF FREQUENCES (ABSOLUTE, RELATIVE AND CUMULATIVE) range.d13<-range(data$d13) breaks.d13<-cut(data$d13,breaks=seq(120,1000,by=40),right=T) absN.d13<-table(breaks.d13) relN.d13<-round(prop.table(absN.d13),3) relNPerc.d13<-100*relN.d13 cum.absN.d13<-cumsum(absN.d13) cum.relN.d13<-cumsum(relN.d13) cum.relNPerc.d13<-cumsum(relNPerc.d13) table=cbind(absN.d13,relN.d13,relNPerc.d13,cum.absN.d13,cum.relNPerc.d13) table colnames(table)<-c("Abs.counts","Rel.counts","Rel.counts in %","Cumul.rel. counts","Cumul.rel. counts in %") print(table) ## USER - DEFINED FUNCTIONS # R function for confidence interval ci_mean<-function(sample, alpha=0.05, one_sided=FALSE) { sample<-sample[!is.na(sample)] sample_mean<-mean(sample) n<-length(sample) sd<-sd(sample) delta <- ifelse(one_sided, qt(1-alpha, n-1)*sd/sqrt(n) , qt(1-alpha/2, n-1)*sd/sqrt(n) ) c(sample_mean - delta, sample_mean + delta) } CI.d13<-ci_mean(data$d13) CI.d13 library(DescTools) MeanCI(data$d13,conf.level=0.90) #plot of confidence intervals of d13- comparison among species library(gplots) plotmeans(data$d13~data$species) #skewness - definition of R-function skew<-function(sample, is_population=FALSE) { sample<-sample[!is.na(sample)] sample_mean<-mean(sample) n<-length(sample) ss<-sum((sample-sample_mean)^3)*sqrt(n)/sum((sample-sample_mean)^2)^(3/2) ifelse(is_population, ss, ss * sqrt(n*(n-1))/(n-2) ) } #kurtosis - definition of R-function kurt<-function(sample, is_population=FALSE) { sample<-sample[!is.na(sample)] sample_mean<-mean(sample) n<-length(sample) kk<-sum((sample-sample_mean)^4)*n/sum((sample-sample_mean)^2)^2 ifelse(is_population, kk, (n-1)/(n-2)/(n-3)*( (n+1)*kk + 6 ) ) } skew_d13<-skew(data$d13) skew_d13 kurt_d13<-kurt(data$d13) kurt_d13 skew_data<-cbind(skew(data$d13),skew(data$h)) skew_data skewh<-tapply(data$h,data$species,skew) skewd<-tapply(data$d13,data$species,skew) kurth<-tapply(data$h,data$species,kurt) kurtd<-tapply(data$d13,data$species,kurt) res_shape<-round(cbind(skewd,kurtd,skewh,kurth),3) res_shape ##BOX COX TRANSFORMATION bc<-sample((data[data$species=='SPRUCE',]$d13),100,replace=TRUE) hist(bc,probability=TRUE) lines(density(bc),col="red") shapiro.test(bc) #skewness - definition of R-function skew<-function(sample, is_population=FALSE) { sample<-sample[!is.na(sample)] sample_mean<-mean(sample) n<-length(sample) ss<-sum((sample-sample_mean)^3)*sqrt(n)/sum((sample-sample_mean)^2)^(3/2) ifelse(is_population, ss, ss * sqrt(n*(n-1))/(n-2) ) } kurt<-function(sample, is_population=TRUE) { sample<-sample[!is.na(sample)] sample_mean<-mean(sample) n<-length(sample) kk<-sum((sample-sample_mean)^4)*n/sum((sample-sample_mean)^2)^2 ifelse(is_population, kk, (n-1)/(n-2)/(n-3)*( (n+1)*kk + 6 ) ) } skew_bc<-skew(bc) skew_bc kurt_bc<-kurt(bc) kurt_bc library(MASS) library(car) boxcox(bc~1, lambda=seq(-3,3,by=0.1)) bcp<-powerTransform(bc) summary(bcp) bclog<-log(bc) bclam<-bcPower(bc,bcp$lambda) #histogram of transformed data par(mfrow=c(1,3)) hist(bc,col="lightblue",probability=TRUE) lines(density(bc),col="red") lines(seq(min(bc, na.rm=TRUE)-100, max(bc, na.rm=TRUE)+100,0.05), dnorm(seq(min(bc, na.rm=TRUE)-100, max(bc,na.rm=TRUE)+100,0.05), mean(bc, na.rm=TRUE), sd(bc, na.rm=TRUE) ), col="black") hist(bclog, col="lightblue", main="Histogram of \n log-transformed sample", probability=TRUE, ylab="hustota", xlab=expression(log(d[13])) ) lines(density(bclog), col="red", lty="dashed") lines(seq(min(bclog, na.rm=TRUE)-1, max(bclog, na.rm=TRUE)+1,0.05), dnorm(seq(min(bclog, na.rm=TRUE)-1, max(bclog, na.rm=TRUE)+1,0.05), mean(bclog, na.rm=TRUE), sd(bclog, na.rm=TRUE) ), col="black") hist(bclam, col="lightblue", main="Histogram of \n lambda-transformed sample",probability=TRUE, ylab="hustota", xlab="lambda transformed values" ) lines(density(bclam,ylim=25), col="red", lty="dashed") lines(seq(min(bclam, na.rm=TRUE)-0.2, max(bclam, na.rm=TRUE)+0.2,0.001), dnorm(seq(min(bclam, na.rm=TRUE)-0.2, max(bclam, na.rm=TRUE)+0.2,0.001), mean(bclam, na.rm=TRUE), sd(bclam, na.rm=TRUE) ), col="black") par(mfrow=c(1,1)) # qq plots of original and transformed data par(mfrow=c(1,3)) qqnorm(bc,main="QQ plot of original sample") qqline(bc,col="red") qqnorm(bclog, main="QQ plot of \n log transformed sample") qqline(bclog,col="red") qqnorm(bclam, main="QQ plot of \n lambda transformed sample") qqline(bclam,col="red") #comparison of parameters (mean) for original, log-transformed and Box-Cox-transformed data library(pastecs) data1<-cbind(bc, bclog,bclam) options(scipen=100) tab_res1<-stat.desc(data1,norm=T) tab_res1<-round(tab_res1,2) tab_res1 options(scipen=0) ci_mean<-function(sample, alpha=0.05, one_sided=FALSE) { sample<-sample[!is.na(sample)] sample_mean<-mean(sample) n<-length(sample) sd<-sd(sample) delta <- ifelse(one_sided, qt(1-alpha, n-1)*sd/sqrt(n) , qt(1-alpha/2, n-1)*sd/sqrt(n) ) c(sample_mean - delta, sample_mean + delta) } #confidence interval of mean for original data ci_mean(bc) #confidence interval of mean for log-transformed data ci_mean(bclog) exp(ci_mean(bclog)) #confidence interval of mean for Box-Cox-transformed data mlam<-mean(bclam) cilam<-ci_mean(bclam) mr<-(mlam*bcp$lambda+1)^(1/bcp$lambda) #computation of real mean (in original units) from transformed data mr1<-(cilam[1]*bcp$lambda+1)^(1/bcp$lambda) #computation of lower limit of real mean (in original units) from transformed data mr2<-(cilam[2]*bcp$lambda+1)^(1/bcp$lambda) #computation of upper limit of real mean (in original units) from transformed data mr1 mr mr2 mo<-cbind(ci_mean(bc)[1],mean(bc),ci_mean(bc)[2]) mlog<-cbind(exp(ci_mean(bclog)[1]),exp(mean(bclog)),exp(ci_mean(bclog)[2])) mlam<-cbind(mr1,mr,mr2) res<-rbind(mo,mlog,mlam) res colnames(res)<-c("lower","mean","upper") rownames(res)<-c("original","log-transformed","B-C-transformed") print(res) #histogram of sample with Ci for untransformed and CI for transformed data par(mfrow=c(1,1)) hist(bc, col="lightblue", main="Histogram of original sample and CIs for original and transformed data", probability=TRUE, ylab="density", xlab=expression(d[13]) ) lines(density(bc), col="red", lty="dashed") lines(seq(min(bc, na.rm=TRUE)-1, max(bc, na.rm=TRUE)+1,0.05), dnorm(seq(min(bc, na.rm=TRUE)-1, max(bc,na.rm=TRUE)+1,0.05), mean(bc,na.rm=TRUE), sd(bc,na.rm=TRUE)), col="black") abline(v=mean(bc, na.rm=TRUE),col="black", lwd=3) # mean of original data abline(v=median(bc, na.rm=TRUE),col="green", lwd=3) # median of original data abline(v=ci_mean(bc), col="black", lty="dotted", lwd=2) # CI of original data abline(v=exp(mean(bclog, na.rm=TRUE)),col="red", lwd=3) # mean of log-transformed data abline(v=exp(ci_mean(bclog)), col="red", lty="dotted", lwd=2) # CI of log-transformed data abline(v=mr,col="blue", lwd=3) # mean of B-C-transformed data abline(v=mr1, col="blue", lty="dotted", lwd=2) # CI of B-C-transformed data – lower-limit abline(v=mr2, col="blue", lty="dotted", lwd=2) # CI of B-C-transformed data – upper-limit legend("topright",c("original mean", "original median","CI orig.data mean","log-trans. mean"," CI log-trans. mean ", "BC trans. mean","CI BC mean"),col=c ("black","green","black","red","red","blue","blue"),lty=c(1,1,3,1,3,1,3),lwd=c(2,2,1,2,1,2,1), cex=0.6)