## POISSON AND NEGATIVE BINOMIAL DISTRIBUTION ## data input, names of variables and dimensions of the data set stems<- read.table('D:/EUFOR/data/stems.csv',header=TRUE,sep=',') names(stems) dim(stems) ## the table function can be used to obtain the frequency distribution of the quadrat counts. table(stems$num.stems)->stem.counts stem.counts ## the category names that appear in the table can be extracted with the names function names(stem.counts) ## the presence of quotes around the numbers indicates that they're being treated as character ## values rather than as numbers. We can convert them back to numbers with the as.numeric ## function as.numeric(names(stem.counts)) ##plot of frequences - with "plot" command plot(as.numeric(names(stem.counts)), stem.counts, type='h', xlab='Quadrat Counts', ylab='Frequency of Occurrence', lwd=4, col='gray60') ## histogram of frequences - incorrect (left) and correct (right) par(mfrow=c(1,2)) hist(stems$num.stems,breaks=0:64,xlab='Quadrat counts',col='gray70',main='Incorrect breakpoints') hist(stems$num.stems, breaks=-0.5:64.5,xlab='Quadrat counts', col='gray70', main='Correct breakpoints') par(mfrow=c(1,1)) ## fitting by Poisson distribution library(MASS) poisson.fit<-fitdistr(stems$num.stems,'Poisson') poisson.fit names(poisson.fit) poisson.fit$estimate poisson.fit$sd poisson.fit$n poisson.fit$loglik ## to obtain the predicted Poisson probabilities we use the dpois function eprobP<-dpois(0:64,poisson.fit$estimate) ## to obtain the expected frequencies under the Poisson model we must multiply these ## probabilities by the total number of observations (312) dpois(0:64,poisson.fit$estimate)*length(stems$num.stems) ## to see how close these estimates are to the actual values, ## we superimpose the estimates on the histogram of the observed counts hist(stems$num.stems,breaks=-0.5:64.5,xlab='Quadrat counts', col='gray70', main='Number of Paulownia stems') lines(0:64, dpois(0:64,poisson.fit$estimate)*length(stems$num.stems), col=2) points(0:64, dpois(0:64,poisson.fit$estimate)*length(stems$num.stems), col=2, pch=16,cex=.6) ## an alternative to the Poisson distribution for count data is the negative binomial ## distribution. NB.fit<- fitdistr(stems$num.stems,'negative binomial') NB.fit$estimate ## to obtain the expected counts we multiply the negative binomial probabilities by the number ## of observations ( 2 paramteters!!) dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2])*length(stems$num.stems) ## to check the fit we add these results to the histogram of observed counts as blue points ## connected by line segments lines(0 :64,dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2])*length (stems$num.stems), col=4) points(0 :64,dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2])*length (stems$num.stems), col=4, pch=16, cex=.6) ## CHI QUADRAT LACK OF FIT TEST FOR NEGATIVE BINOMIAL DISTRIBUTION ## sum of probabilities does not sum to 1, we must add additional category sum(dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2])) eprob<-c(dnbinom(0:64,size=NB.fit$estimate[1], mu=NB.fit$estimate[2]), 1-pnbinom(64, size=NB.fit$estimate[1], mu=NB.fit$estimate[2])) sum(eprob) ## we obtain the expected counts by multiplying the expected probabilities by the sample size ecounts<-eprob*length(stems$num.stems) ## we must find how many categories we must combine to achieve minimum frequency of 5. Firstly ## we use function "cumsum" for cumulative sums in reverse order. b ## to clarify which cumulative counts correspond to which categories in their original order we ## append the original order of the cells as a row above the cumulative counts using "rbind" rbind(66:1,round(cumsum(rev(ecounts)),2)) ## moving from the tail in we see that 5 is exceeded for the first time at entry 42. According ## to category number this corresponds to category 25. Thus if we sum categories 25 through 66 ## we should just exceed 5 sum(ecounts[25:66]) ## in the same way we must combine some other categories (18-24, 14-17, 11-13, 9-10) Ei<-c(ecounts[1:8],sum(ecounts[9:10]), sum(ecounts[11:13]), sum(ecounts[14:17]), sum (ecounts[18:24]), sum(ecounts[25:66])) Ei ## now we need to repeat this for the observed counts so that the categories match. Currently ## not all count categories are represented among the observed counts. We must add the missing ## categories, inserting a value of zero for their counts stem.counts ## first make a data frame out of the variable stem.counts, the observed count frequencies. We ## need two columns: the category labels (as numbers) and the count frequencies. We also use ## the "as.numeric" function on the counts to prevent R from inserting an extra column of ## labels. o1<-data.frame(as.numeric(stem.counts),as.numeric(names(stem.counts))) ## change the column labels so that the categories are called "count". colnames(o1)<-c('frequency','count') ## next we make a data frame out of the numbers 0 through 65 and call the single column ## "count", the same name I used for the counts in variable o1. o2<-data.frame(0:65) colnames(o2)<-c('count') ## finally we combine the two data frames case by case using the "merge" function o3<-merge(o1,o2,all=TRUE) o3 ## we convert the missing values to zeros o3$frequency<-ifelse(is.na(o3$frequency),0,o3$frequency) o3 ## Since the categories of o3$frequency are now exactly the same as they are in the vector ## ecounts from above, I can use the code that created Ei above to now create the observed ## categories Oi. I just recall that line to command prompt and change ecounts to o3$frequency ## everywhere. Oi<-c(o3$frequency[1:8],sum(o3$frequency[9:10]), sum(o3$frequency[11:13]), sum(o3 $frequency[14:17]), sum(o3$frequency[18:24]), sum(o3$frequency[25:66])) ## plot for comparison of expected and observed frequences plot(c(1:13),Oi,type="p") points(c(1:13),Ei,col="red") ## Pearson test pearson<-sum((Oi-Ei)^2/Ei) pearson ## p-value 1-pchisq(pearson,df=length(Oi)-1-2) ## randomization Version of the Lack of Fit Test chisq.test(o3$frequency,p=eprob,simulate.p.value=TRUE) chisq.test(o3$frequency,p=eprob,simulate.p.value=TRUE,B=10000) ## checking residuals - the squared residuals are the individual terms of the Pearson chi- ## squared test chisq.test(o3$frequency,p=eprob,simulate.p.value=TRUE)->out.chi round(out.chi$residuals^2,2) ## Pearson test for Poisson distribution sum(dpois(0:64,poisson.fit$estimate)) eprobP<-dpois(0:64,poisson.fit$estimate) oP<-o3$frequency[-66] chisq.test(oP,p=eprobP,simulate.p.value=TRUE) chisq.test(oP,p=eprobP,simulate.p.value=TRUE,B=10000)