#import data file into R
data<-read.table("http://user.mendelu.cz/drapela/Forest_Biometry/Data/plots.txt",sep='\t',header=TRUE)

#########################################################################
#                            EXPLORATORY DATA ANALYSIS                  #
#########################################################################
par(mfrow=c(1,1))

##HISTOGRAMS

# selection of DBH of spruce to the variable "d13"
d13<-data[data$species=='SPRUCE',]$d13

# histogram of "d13" - y axis is scaled as a density
hist(d13, col="lightblue", main="Histogram DBH - spruce", probability=T,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' ')))
rug(d13)

# we add lines of experimental density(dashed blue) and density of normal distribution (solid red) 
xx<-seq(min(d13),max(d13),1)
lines(xx,dnorm(xx,mean(d13),sd(d13)),col="red")
lines(density(d13),col="blue",lty="dashed")

# the same plot with user-defined breaks and diameter class width
hist(d13,breaks=seq(110,730,by=20),ylim=c(0,0.008), col="lightblue", main="Histogram DBH - spruce",probability=T,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' ')))
xx<-seq(min(d13),max(d13),1)
lines(xx,dnorm(xx,mean(d13),sd(d13)),col="red")
lines(density(d13),col="blue",lty="dashed")

# the same plot but there are frequences instead of densities on y-axis
hist(d13,breaks=seq(110,730,by=20), col="lightblue", main="Histogram DBH - spruce",probability=F,ylab="frequency", xlab=substitute(paste(d[13], "[mm]", sep=' ')))
xx<-seq(min(d13),max(d13),1)
lines(xx,20*length(d13)*dnorm(xx,mean(d13),sd(d13)),col="red")
dd13<-density(d13)
lines(dd13$x,20*length(d13)*dd13$y,col="blue",lty="dashed")


#histograms d13 for all tree species in one plot (with empirical density)
b.d13<-data[data$species=='BEECH',]$d13
p.d13<-data[data$species=='PINE',]$d13
o.d13<-data[data$species=='OAK',]$d13

par(mfrow=c(2,2))

hist(d13, col="lightblue", main="Histogram DBH - spruce",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' ')))
lines(density(d13), col="red", lty="dashed")

hist(b.d13, col="lightblue", main="Histogram DBH - beech",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' ')))
lines(density(b.d13), col="red", lty="dashed")

hist(o.d13, col="lightblue", main="Histogram DBH - oak",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' ')))
lines(density(o.d13), col="red", lty="dashed")


hist(p.d13, col="lightblue", main="Histogram DBH - pine",probability=TRUE,ylab="density", xlab=substitute(paste(d[13], "[mm]", sep=' ')))
lines(density(p.d13), col="red", lty="dashed")

par(mfrow=c(1,1))



##QUANTILE-QUANTILE PLOT (QQ)
qqnorm(d13)
qqline(d13)

library(car)
qqPlot(d13)

#QQ plots d13 for all tree species in one plot
par(mfrow=c(2,2))

qqnorm(d13,  main="QQ plot DBH - spruce")
qqline(d13, col="red")

qqnorm(b.d13,  main="QQ plot DBH - beech")
qqline(b.d13, col="red")

qqnorm(o.d13, main="QQ plot DBH - oak")
qqline(o.d13, col="red")

qqnorm(p.d13,  main="QQ plot DBH - pine")
qqline(p.d13, col="red")


## BOX PLOTS

# horizontal box plot
par(mfrow=c(2,1))

boxplot(d13~species, data= data, notch=TRUE, col="lightgreen", main="Box plot DBH", ylab = "d13 [mm]",horizontal=TRUE)
boxplot(h~species, data= data, notch=TRUE, col="lightgreen", main="Box plot height", ylab= "h[m]",horizontal=TRUE)
par(mfrow=c(1,1))

# "better" box plots
par(mfrow=c(1,2))
boxplot(d13~species, data= data, notch=TRUE, col="lightgreen", main="Box plot DBH", ylab = "d13[mm]",outcol="white")
points(jitter(as.numeric(factor(data$species))),data$d13,col="lightblue4",pch=1,cex=0.6)
points(c(1,2,3,4),tapply(data$d13,data$species,mean),pch=16,col="red")
boxplot(h~species, data= data, notch=TRUE, col="lightgreen", main="Box plot height", ylab= "h[m]",outcol="white")
points(jitter(as.numeric(factor(data$species))),data$h,col="lightblue4",pch=1,cex=0.6)
points(c(1,2,3,4),tapply(data$h,data$species,mean,na.rm=TRUE),pch=16,col="red")
par(mfrow=c(1,1))
## TESTS OF NORMALITY

#shapiro wilk's test of normality for d13
shapiro.test(data[data$species=='SPRUCE',]$d13)
shapiro.test(data[data$species=='BEECH',]$d13)
shapiro.test(data[data$species=='OAK',]$d13)
shapiro.test(data[data$species=='PINE',]$d13)








#############################################################
#                                     ESTIMATES OF PARAMETERS                                          #
#############################################################

## SIMPLE  R-FUNCTIONS FOR PARAMETER ESTIMATES

# mean for d13 by formula
mean_d13<-sum(data$d13)/length(data$d13)
mean_d13

# mean for d13 by R function
mean1_d13<-mean(data$d13)
mean1_d13

#mean for h by R function 
mean(data[data$species=='SPRUCE',]$h)
mean(data[data$species=='SPRUCE',]$h, na.rm=TRUE)

#mean for h by R function  with saving into variable "m"
m<-mean(data[data$species=='SPRUCE',]$h, na.rm=TRUE)



#mean for all species
mean<-tapply(data$h,data$species,mean,na.rm=T)
mean     

#sd and median for all species
sd<-tapply(data$h,data$species,sd,na.rm=T)
sd
median<-tapply(data$h,data$species,median,na.rm=T)
median

#table of results
res<-cbind(mean,median, sd)
res

#rounding of mean and sd to one decimal point
mean<-round(mean,1)
sd<-round(sd,1)
res1<-round(cbind(mean,median, sd),1)
res1 

## SOME COMMANDS THAT CREATE TABLES OF ALL DESCRIPTIVE PARAMETERS IN ONE STEP

#http://www.sthda.com/english/wiki/descriptive-statistics-and-graphics

#estimation of parameters by means of "psych" package
library(psych)
describe(d13)
describeBy(data$d13,data$species)


#estimation of parameters by means of "pastecs" package
library(pastecs)
data1<-cbind(data$d13, data$h,data$h_live,data$h_dead)
tab_res<-stat.desc(data1,norm=T)
tab_res<-round(tab_res,2)
tab_res



# TABLE OF FREQUENCES (ABSOLUTE, RELATIVE AND CUMULATIVE)
range.d13<-range(data$d13)
breaks.d13<-cut(data$d13,breaks=seq(120,1000,by=40),right=T)
absN.d13<-table(breaks.d13)
relN.d13<-round(prop.table(absN.d13),3)
relNPerc.d13<-100*relN.d13
cum.absN.d13<-cumsum(absN.d13)
cum.relN.d13<-cumsum(relN.d13)
cum.relNPerc.d13<-cumsum(relNPerc.d13)
table=cbind(absN.d13,relN.d13,relNPerc.d13,cum.absN.d13,cum.relNPerc.d13)
table
colnames(table)<-c("Abs.counts","Rel.counts","Rel.counts in %","Cumul.rel. counts","Cumul.rel.  counts in %")
print(table)


## USER - DEFINED FUNCTIONS

# R function for confidence interval

ci_mean<-function(sample, alpha=0.05, one_sided=FALSE)
{
  sample<-sample[!is.na(sample)]
  sample_mean<-mean(sample)
  n<-length(sample)
  sd<-sd(sample)
  delta <- ifelse(one_sided, qt(1-alpha, n-1)*sd/sqrt(n) , qt(1-alpha/2, n-1)*sd/sqrt(n) ) 
  c(sample_mean - delta, sample_mean + delta)
}

CI.d13<-ci_mean(data$d13)
CI.d13

library(DescTools)
MeanCI(data$d13,conf.level=0.90)

#plot of confidence intervals of d13- comparison among species
library(gplots)
plotmeans(data$d13~data$species)



#skewness - definition of R-function 

skew<-function(sample, is_population=FALSE)
{
  
  sample<-sample[!is.na(sample)]
  sample_mean<-mean(sample)
  n<-length(sample)
  ss<-sum((sample-sample_mean)^3)*sqrt(n)/sum((sample-sample_mean)^2)^(3/2)
  ifelse(is_population, ss, ss * sqrt(n*(n-1))/(n-2) )
  
}





#kurtosis - definition of R-function 

kurt<-function(sample, is_population=FALSE)
{
  
  sample<-sample[!is.na(sample)]
  sample_mean<-mean(sample)
  n<-length(sample)
  kk<-sum((sample-sample_mean)^4)*n/sum((sample-sample_mean)^2)^2
  ifelse(is_population, kk, (n-1)/(n-2)/(n-3)*( (n+1)*kk + 6 ) )
}

skew_d13<-skew(data$d13)
skew_d13

kurt_d13<-kurt(data$d13)
kurt_d13

skew_data<-cbind(skew(data$d13),skew(data$h))
skew_data


skewh<-tapply(data$h,data$species,skew)
skewd<-tapply(data$d13,data$species,skew)
kurth<-tapply(data$h,data$species,kurt)
kurtd<-tapply(data$d13,data$species,kurt)
res_shape<-round(cbind(skewd,kurtd,skewh,kurth),3)
res_shape




##BOX COX TRANSFORMATION

bc<-sample((data[data$species=='SPRUCE',]$d13),100,replace=TRUE)
hist(bc,probability=TRUE)
lines(density(bc),col="red")
shapiro.test(bc)

#skewness - definition of R-function 

skew<-function(sample, is_population=FALSE)
{  
  sample<-sample[!is.na(sample)]
  sample_mean<-mean(sample)
  n<-length(sample)
  ss<-sum((sample-sample_mean)^3)*sqrt(n)/sum((sample-sample_mean)^2)^(3/2)
  ifelse(is_population, ss, ss * sqrt(n*(n-1))/(n-2) )
}
kurt<-function(sample, is_population=TRUE)
{
  
  sample<-sample[!is.na(sample)]
  sample_mean<-mean(sample)
  n<-length(sample)
  kk<-sum((sample-sample_mean)^4)*n/sum((sample-sample_mean)^2)^2
  ifelse(is_population, kk, (n-1)/(n-2)/(n-3)*( (n+1)*kk + 6 ) )
}
skew_bc<-skew(bc)
skew_bc
kurt_bc<-kurt(bc)
kurt_bc

library(MASS)
library(car)
boxcox(bc~1, lambda=seq(-3,3,by=0.1))
bcp<-powerTransform(bc)
summary(bcp)

bclog<-log(bc)
bclam<-bcPower(bc,bcp$lambda)
#histogram of transformed data
par(mfrow=c(1,3))

hist(bc,col="lightblue",probability=TRUE)
lines(density(bc),col="red")		
lines(seq(min(bc, na.rm=TRUE)-100, max(bc, na.rm=TRUE)+100,0.05), dnorm(seq(min(bc, na.rm=TRUE)-100, max(bc,na.rm=TRUE)+100,0.05), mean(bc,  na.rm=TRUE), sd(bc, na.rm=TRUE) ), col="black")

hist(bclog, col="lightblue", main="Histogram of  \n log-transformed sample", probability=TRUE, ylab="hustota", xlab=expression(log(d[13])) )
lines(density(bclog), col="red", lty="dashed")
lines(seq(min(bclog, na.rm=TRUE)-1, max(bclog, na.rm=TRUE)+1,0.05), dnorm(seq(min(bclog, na.rm=TRUE)-1, max(bclog, na.rm=TRUE)+1,0.05), mean(bclog,  na.rm=TRUE), sd(bclog, na.rm=TRUE) ), col="black")

hist(bclam, col="lightblue", main="Histogram of  \n lambda-transformed sample",probability=TRUE, ylab="hustota", xlab="lambda transformed values" )
lines(density(bclam,ylim=25), col="red", lty="dashed")
lines(seq(min(bclam, na.rm=TRUE)-0.2, max(bclam, na.rm=TRUE)+0.2,0.001), dnorm(seq(min(bclam, na.rm=TRUE)-0.2, max(bclam, na.rm=TRUE)+0.2,0.001), mean(bclam,  na.rm=TRUE), sd(bclam, na.rm=TRUE) ), col="black")

par(mfrow=c(1,1))



# qq plots of original and transformed data		
par(mfrow=c(1,3))
qqnorm(bc,main="QQ plot of original sample")
qqline(bc,col="red")
qqnorm(bclog, main="QQ plot of \n log transformed sample")
qqline(bclog,col="red")
qqnorm(bclam, main="QQ plot of \n lambda transformed sample")
qqline(bclam,col="red")

#comparison of parameters (mean) for original, log-transformed and  Box-Cox-transformed data
library(pastecs)
data1<-cbind(bc, bclog,bclam)
options(scipen=100)
tab_res1<-stat.desc(data1,norm=T)
tab_res1<-round(tab_res1,2)
tab_res1
options(scipen=0)	





ci_mean<-function(sample, alpha=0.05, one_sided=FALSE)
{
  sample<-sample[!is.na(sample)]
  sample_mean<-mean(sample)
  n<-length(sample)
  sd<-sd(sample)
  delta <- ifelse(one_sided, qt(1-alpha, n-1)*sd/sqrt(n) , qt(1-alpha/2, n-1)*sd/sqrt(n) ) 
  c(sample_mean - delta, sample_mean + delta)
}

#confidence interval of mean  for original data
ci_mean(bc)

#confidence interval of mean  for log-transformed data
ci_mean(bclog)
exp(ci_mean(bclog))






#confidence interval of mean  for Box-Cox-transformed data
mlam<-mean(bclam)
cilam<-ci_mean(bclam)
mr<-(mlam*bcp$lambda+1)^(1/bcp$lambda)   #computation of  real  mean (in original units) from transformed data 
mr1<-(cilam[1]*bcp$lambda+1)^(1/bcp$lambda)   #computation of  lower limit of real  mean (in original units) from transformed data
mr2<-(cilam[2]*bcp$lambda+1)^(1/bcp$lambda)  #computation of  upper  limit of real  mean (in original units) from transformed data
mr1
mr	
mr2
mo<-cbind(ci_mean(bc)[1],mean(bc),ci_mean(bc)[2])
mlog<-cbind(exp(ci_mean(bclog)[1]),exp(mean(bclog)),exp(ci_mean(bclog)[2]))
mlam<-cbind(mr1,mr,mr2)
res<-rbind(mo,mlog,mlam)
res
colnames(res)<-c("lower","mean","upper")
rownames(res)<-c("original","log-transformed","B-C-transformed")
print(res)





#histogram of sample with Ci for untransformed and CI for transformed data
par(mfrow=c(1,1))

hist(bc, col="lightblue", main="Histogram of original sample and CIs for original and transformed data", probability=TRUE, ylab="density", xlab=expression(d[13]) )
lines(density(bc), col="red", lty="dashed")
lines(seq(min(bc, na.rm=TRUE)-1, max(bc, na.rm=TRUE)+1,0.05), dnorm(seq(min(bc, na.rm=TRUE)-1, max(bc,na.rm=TRUE)+1,0.05), mean(bc,na.rm=TRUE),      sd(bc,na.rm=TRUE)), col="black")

abline(v=mean(bc, na.rm=TRUE),col="black", lwd=3)                         # mean of original data
abline(v=median(bc, na.rm=TRUE),col="green", lwd=3)                     # median of original data
abline(v=ci_mean(bc), col="black", lty="dotted", lwd=2)                   # CI of original data

abline(v=exp(mean(bclog, na.rm=TRUE)),col="red", lwd=3)              # mean of log-transformed data
abline(v=exp(ci_mean(bclog)), col="red", lty="dotted", lwd=2)        # CI of log-transformed data

abline(v=mr,col="blue", lwd=3)                                                               # mean of B-C-transformed data
abline(v=mr1, col="blue", lty="dotted", lwd=2)                                   # CI of B-C-transformed data – lower-limit  
abline(v=mr2, col="blue", lty="dotted", lwd=2)                                   # CI of B-C-transformed data – upper-limit  

legend("topright",c("original mean", "original median","CI orig.data mean","log-trans. mean"," CI log-trans. mean ", "BC trans. mean","CI BC mean"),col=c
       ("black","green","black","red","red","blue","blue"),lty=c(1,1,3,1,3,1,3),lwd=c(2,2,1,2,1,2,1), cex=0.6)