## POISSON AND NEGATIVE BINOMIAL DISTRIBUTION



## data input, names of variables and dimensions of the data set

	stems<- read.table('D:/EUFOR/data/stems.csv',header=TRUE,sep=',')
	names(stems)
	dim(stems)



## the table function can be used to obtain the frequency distribution of the quadrat counts.

	table(stems$num.stems)->stem.counts
 	stem.counts



## the category names that appear in the table can be extracted with the names function

	names(stem.counts)


## the presence of quotes around the numbers indicates that they're being treated as character ## values rather than as numbers. We can convert them back to numbers with the as.numeric  ## function

	as.numeric(names(stem.counts))



##plot of frequences - with "plot" command

	plot(as.numeric(names(stem.counts)), stem.counts, type='h',
		xlab='Quadrat Counts', ylab='Frequency of Occurrence', lwd=4, col='gray60')


## histogram of frequences - incorrect (left) and correct (right)

	par(mfrow=c(1,2))
	hist(stems$num.stems,breaks=0:64,xlab='Quadrat counts',col='gray70',main='Incorrect 		breakpoints')
	hist(stems$num.stems, breaks=-0.5:64.5,xlab='Quadrat counts', col='gray70', 		main='Correct breakpoints')
	par(mfrow=c(1,1))



## fitting by Poisson distribution

	library(MASS)
	poisson.fit<-fitdistr(stems$num.stems,'Poisson')
	poisson.fit
	names(poisson.fit)
	poisson.fit$estimate
	poisson.fit$sd
	poisson.fit$n
	poisson.fit$loglik


## to obtain the predicted Poisson probabilities we use the dpois function

	eprobP<-dpois(0:64,poisson.fit$estimate)


## to obtain the expected frequencies under the Poisson model we must multiply these ## probabilities by the total number of observations (312)

	dpois(0:64,poisson.fit$estimate)*length(stems$num.stems)


## to see how close these estimates are to the actual values, 
## we superimpose the estimates on the histogram of the observed counts

	hist(stems$num.stems,breaks=-0.5:64.5,xlab='Quadrat counts', col='gray70', main='Number 		of Paulownia stems')
	lines(0:64, dpois(0:64,poisson.fit$estimate)*length(stems$num.stems), col=2)
	points(0:64, dpois(0:64,poisson.fit$estimate)*length(stems$num.stems), col=2, 				pch=16,cex=.6)



## an alternative to the Poisson distribution for count data is the negative binomial ## distribution.

	NB.fit<- fitdistr(stems$num.stems,'negative binomial')
	NB.fit$estimate


## to obtain the expected counts we multiply the negative binomial probabilities by the number ## of observations ( 2 paramteters!!)

	dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2])*length(stems$num.stems)


## to check the fit we add these results to the histogram of observed counts as blue points ## connected by line segments

	lines(0 :64,dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2])*length				(stems$num.stems), col=4)
	points(0 :64,dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2])*length		(stems$num.stems), col=4, pch=16, cex=.6)



## CHI QUADRAT LACK OF FIT TEST FOR NEGATIVE BINOMIAL DISTRIBUTION


## sum of probabilities does not sum to 1, we must add additional category

	sum(dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2]))
	eprob<-c(dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2]), 1-pnbinom(64, 		size=NB.fit$estimate[1], mu=NB.fit$estimate[2]))
	sum(eprob)


## we obtain the expected counts by multiplying the expected probabilities by the sample size

	ecounts<-eprob*length(stems$num.stems)


## we must find how many categories we must combine to achieve minimum frequency of 5. Firstly ## we use function "cumsum" for cumulative sums in reverse order.

	b

## to clarify which cumulative counts correspond to which categories in their original order we ## append the original order of the cells as a row above the cumulative counts using "rbind" 

	rbind(66:1,round(cumsum(rev(ecounts)),2))


## moving from the tail in we see that 5 is exceeded for the first time at entry 42. According ## to category number this corresponds to category 25. Thus if we sum categories 25 through 66 ## we should just exceed 5

	sum(ecounts[25:66])


## in the same way we must combine some other categories (18-24, 14-17, 11-13, 9-10) 

	Ei<-c(ecounts[1:8],sum(ecounts[9:10]), sum(ecounts[11:13]), sum(ecounts[14:17]), sum		(ecounts[18:24]), sum(ecounts[25:66]))
	Ei
	

## now we need to repeat this for the observed counts so that the categories match. Currently ## not all count categories are represented among the observed counts. We must add the missing ## categories, inserting a value of zero for their counts

	stem.counts


## first make a data frame out of the variable stem.counts, the observed count frequencies. We ## need two columns: the category labels (as numbers) and the count frequencies. We also use  ## the "as.numeric" function on the counts to prevent R from inserting an extra column of ## labels.

	o1<-data.frame(as.numeric(stem.counts),as.numeric(names(stem.counts)))


## change the column labels so that the categories are called "count".

	colnames(o1)<-c('frequency','count')


## next we make a data frame out of the numbers 0 through 65 and call the single column ## "count", the same name I used for the counts in variable o1. 

	o2<-data.frame(0:65)
	colnames(o2)<-c('count')


## finally we combine the two data frames case by case using the "merge" function

	o3<-merge(o1,o2,all=TRUE)
	o3


## we convert the missing values to zeros

	o3$frequency<-ifelse(is.na(o3$frequency),0,o3$frequency)
	o3


## Since the categories of o3$frequency are now exactly the same as they are in the vector 
## ecounts from above, I can use the code that created Ei above to now create the observed 
## categories Oi. I just recall that line to command prompt and change ecounts to o3$frequency ## everywhere.

	Oi<-c(o3$frequency[1:8],sum(o3$frequency[9:10]), sum(o3$frequency[11:13]), sum(o3		$frequency[14:17]), sum(o3$frequency[18:24]), sum(o3$frequency[25:66]))



## plot for comparison of expected and observed frequences
	
	plot(c(1:13),Oi,type="p")
	points(c(1:13),Ei,col="red")


## Pearson test

	pearson<-sum((Oi-Ei)^2/Ei)
	pearson


## p-value

	1-pchisq(pearson,df=length(Oi)-1-2)


## randomization Version of the Lack of Fit Test 

	chisq.test(o3$frequency,p=eprob,simulate.p.value=TRUE)
	chisq.test(o3$frequency,p=eprob,simulate.p.value=TRUE,B=10000)



## checking residuals - the squared residuals are the individual terms of the Pearson chi-
## squared test
	chisq.test(o3$frequency,p=eprob,simulate.p.value=TRUE)->out.chi
	round(out.chi$residuals^2,2)


## Pearson test for Poisson distribution

	sum(dpois(0:64,poisson.fit$estimate))
	eprobP<-dpois(0:64,poisson.fit$estimate)
	oP<-o3$frequency[-66]
	chisq.test(oP,p=eprobP,simulate.p.value=TRUE)
	chisq.test(oP,p=eprobP,simulate.p.value=TRUE,B=10000)