# PREPARATION OF THE DATA
# BREAST HEIGHT DIAMETER AND HEIGHT OF PINE 
################################################################################

#import data file into R
data<-read.table("http://user.mendelu.cz/drapela/Forest_Biometry/Data/plots.txt",sep='\t',header=TRUE)
data$id<-paste(data$id_plot, data$id_tree, sep="-")
head(data)
tmp1<-data[,c(9,3,4,5)]
head(tmp1)
tmp1$d13<-tmp1$d13/10
tmp2<-tmp1[tmp1$species=='PINE',]
head(tmp2)
pine<-tmp2[!is.na(tmp2$h), ]
head(pine)
pine<-pine[,-2]
head(pine)

#  SIMPLE LINEAR MODEL BETWEEN ONE RESPONSE AND ONE EXPLANATORY VARIABLE
#  RELATIONSHIP BETWEEN DBH AND HEIGHT
##########################################################################

#scatterplot

par(mfrow=c(1,1))
plot(pine$h~pine$d13, xlab="d13 [cm]", ylab="h [m]")

#better scatterplot from car library with boxplots
library(car)
scatterplot(pine$d13,pine$h,  id.n=5, lwd =2,lty.smooth = 5, lty.spread =3,lwd.smooth=1,lwd.spread=1)


# BASIC MEASURES OF CORRELATION

# covariance

#by formula
sum((pine$d13-mean(pine$d13)) * (pine$h-mean(pine$h)))/ (length(pine$d13)-1)
#by function
cov(pine$h,pine$d13)


#Pearson correlation coeficient

#by formula	
sum((pine$d13-mean(pine$d13)) * (pine$h-mean(pine$h)))/ (length(pine$d13)-1) / sd(pine$d13)/sd			(pine$h)
#by function
cor(pine$h, pine$d13)


#Spearman correlation coeficient

cor(pine$h,pine$d13,  method="spearman")


#test of significance of corr.coef.

cor.test(pine$h,pine$d13 )


# PARAMETER ESTIMATION OF LINEAR MODEL

# parameters of linear model

lm_01<-lm(pine$h~pine$d13)


#possible outputs of model "lm_01"

lm_01
summary(lm_01)
anova(lm_01)

names(lm_01)
lm_01$coefficients
lm_01$coefficients[1]
lm_01$coefficients[2]

out<-summary(lm_01)
names(out)
out$sigma
out$r.squared


#only coefficients of the model

coef(lm_01)


#residuals of the model

residuals(lm_01)[1:20]


#fitted values of the model

fitted(lm_01)[1:20]


#confidence intervals of model parameters

confint(lm_01)



#plot of the model with measured and model values

plot(pine$h~pine$d13, xlab="diameters [cm]", ylab="heights[m]")
abline(lm_01, col="blue", lty="dashed")

plot(pine$h~pine$d13, xlab="diameters [cm]", ylab="heights[m]")
abline(a=13.287,b=0.30478, col="red", lty="dashed")


#computing of confidence a prediction intervals of the model

tmp<-data.frame(d13=pine[order(pine$d13),2])
head(tmp)
tmp_ci<-predict.lm(lm_01, new=tmp,interval="confidence")
tmp_pi<-predict.lm(lm_01, new= tmp, interval="prediction")
head(tmp_ci)
head(tmp_pi)
ci<-data.frame(cbind(tmp_ci, tmp_pi[,-1]))
cii<-ci[order(ci$fit),]
head(cii)

#plot of confidence a prediction intervals of the model


plot(pine$h~pine$d13, xlab="diameters [cm]", ylab="heights [m]" )
matlines(tmp$d13, cii, lty=c("solid", "dashed", "dashed", "dashed", "dashed"), col=c("blue", "red", "red", "green", "green"))

# confidence and prediction intervals - function "ci.plot" - package "HH"
pine1=data.frame(x=pine$d13,y=pine$h)
lm1=lm(y ~ x, data=pine1)
ci.plot(lm1)



#diagnostics of residuals

res<-residuals(lm_01)


#box plot

boxplot(res, main="Box plot for residuals of the model lm_01", notch=TRUE, col="lightgreen",boxwex=0.3)
abline(0,0,col="red")


#histogram

hist(res, col="lightblue", main="Histogram of residuals of the model lm_01",ylim=c(0,0.3),probability=TRUE,ylab="density", xlab="residuals")
lines(density(res), col="red", lty="dashed")
res1<-seq(min(res)-2,max(res)+2,0.1)
lines(res1,dnorm(res1, mean(res), sd(res)),col="green")

#QQ plot

qqnorm(res, main="QQ plot of residuals for model lm_01")
qqline(res, col="red")

#statistics of residuals

library(pastecs)
data1<-res
options(scipen=100)
tab_res<-stat.desc(data1,norm=T)
tab_res<-round(tab_res,2)
tab_res
options(scipen=0)


#test of normality of residuals

shapiro.test(res)


# Breusch-Pagan test against heteroskedasticity

library(lmtest)
bptest(lm_01)

#autocorrelation test - Durbin Watson test

dwtest(lm_01,alternative="two.sided")


# saving of influential points according to several measures 

infl<-influence.measures(lm_01)


# list of "suspected" values (possible influential points)

summary(infl)


#bubble plot of influential plot
influencePlot(lm_01, main="Influence Plot",
              sub="Circle size is proportial to Cook's Distance", id.n=5 )


# 4 plots of various measures of influnetial points with indentification of 5 most "suspected" points

influenceIndexPlot(lm_01,id.method="y",id.n=5)


#selected diagnostic plots for linear model

plot(lm_01)