Representation of Data and Frequency Distributions
Koushik Khan
Tuesday, September 09, 2014
Problem Set 1, (a)
states<-c("Punjab","Haryana","Tamil Nadu","Andhra Pradesh",
"Uttar Pradesh","Others")
procurement<-c(5486,1248,589,3987,1296,1654)
rice.proc<-data.frame(states,procurement)
rice.proc<-transform(rice.proc,percentage=(rice.proc$procurement/14260)*100)
rice.proc
## states procurement percentage
## 1 Punjab 5486 38.471
## 2 Haryana 1248 8.752
## 3 Tamil Nadu 589 4.130
## 4 Andhra Pradesh 3987 27.959
## 5 Uttar Pradesh 1296 9.088
## 6 Others 1654 11.599
attach(rice.proc)
pie(procurement,labels=c("Punjab(38.47%)","Haryana(8.75%)","Tamil Nadu(4.13%)",
"Andhra Pradesh(27.96%)","Uttar Pradesh(9.09%)","Others(11.60%)"),
main="Procurement of Rice('000 tonnes)\nduring Oct. 1993 to Sept. 1994",
col=c("royalblue4","orchid","chocolate4","olivedrab3","orange3","purple1","violetred4"))
detach(rice.proc)
Comment: Procurement of rice is maximum (38.47%) for Punjab during the given time period, this implies that there was a big crisis for rice in Punjab i.e. production of rice is very poor. Similarly procurement is minimum (4.13%)for Tamil Nadu implying a good production of rice there for that period.
Problem Set 1, (b)
consumption<-matrix(c(6.93,4.70,0.95,0.56,0.52,5.45,4.84,0.42,0.19,0.15),
nrow=2,ncol=5,byrow=T)
rownames(consumption)<-c("Rural","Urban")
colnames(consumption)<-c("Rice","Wheat","Jowar","Bazra","Maize")
consumption
## Rice Wheat Jowar Bazra Maize
## Rural 6.93 4.70 0.95 0.56 0.52
## Urban 5.45 4.84 0.42 0.19 0.15
barplot(consumption,beside=T,main="Consumption of cereals (in kg./month)\n
during 1989-'90",legend.text=rownames(consumption))
Comment: Except wheat, consumption is higher for other cereals for rural India than the urban. Even it is remarkably higher for rice. This indicate rural Indians are mostly dependent on rice. Wheat is consumed in more or less same quantity in rural & urban areas. In some parts rural peoples consumed Jowar, Bazra & Maize in a relatively small amount than rice. But the overall consumption is higher for rural people, which indicates that rural Indians are dependent over these cereals, where as urban Indians are familiar with modern foods.
Problem Set 2, (a)
obv<-c(11.5,8.8,10.1,8.2,9.3,10,9.7,10.1,10.3,10.3,11.3,9.8,
10.7,9.8,10.7,9.8,9.3,8.6,10.4,9.8,11.3,8.4,9,10.7,9.6,11.2)
freq.tab<-table(obv) #creates frequency table
freq<-unname(freq.tab)
x<-as.numeric(names(freq.tab)) #set of distinct observations
am=sum(x*freq)/sum(freq) #arithmatic mean
am
## [1] 9.95
gm=prod(x^freq)^(1/sum(freq)) #geometric mean
gm
## [1] 9.91
hm=sum(freq)/sum(freq/x) #harmonic mean
hm
## [1] 9.869
mode<-x[which(freq==max(freq))] #obs. with max. frequency
mode
## [1] 9.8
obv<-sort(obv) #ordered sample
n<-length(obv)
median<-(obv[n/2]+obv[(n/2)+1])/2
median
## [1] 9.9
std.dev<-sqrt(sum((obv-mean(obv))^2)/n)
std.dev
## [1] 0.8863
md1<-sum(abs(obv-10))/n #mean dev. about 10
md1
## [1] 0.7115
md2<-sum(abs(obv-mean(obv)))/n #mean dev. about mean
md2
## [1] 0.7115
range<-max(obv)-min(obv) #range
range
## [1] 3.3
quantile(obv)
## 0% 25% 50% 75% 100%
## 8.200 9.375 9.900 10.625 11.500
u<-unname(quantile(obv))
qrtl.dev<-u[4]-u[2] #quartile deviation
qrtl.dev
## [1] 1.25
m1<-sum(x*freq)/sum(freq) #1st order raw moment
m1
## [1] 9.95
m2<-sum(x^2*freq)/sum(freq)#2nd order raw moment
m2
## [1] 99.79
m3<-sum(x^3*freq)/sum(freq)#3rd order raw moment
m3
## [1] 1008
m4<-sum(x^4*freq)/sum(freq)#4th order raw moment
m4
## [1] 10266
M2<-(m2-m1^2) #2nd order central moment
M2
## [1] 0.7856
M3<-m3-(3*m2*m1)+(2*m1^3) #3rd order central moment
M3
## [1] -0.096
M4<-m4-(4*m3*m1)+(6*m2*m1^2)-(3*m1^4) #4th order central moment
M4
## [1] 1.434
g1<-M3/(std.dev^3) #1st measure of skewness
g1
## [1] -0.1379
b1<-g1^2 #2nd measure of skewness
b1
## [1] 0.01901
g2<-(M4/(std.dev^4))-3 #1st measure of kurtosis
g2
## [1] -0.6759
b2<-g2+3 #2nd measure of kurtosis
b2
## [1] 2.324
Comment: On the basis of g1 & g2 we can say that the distribution is negatively skewed and platykurtic.
Problem Set 2, (b)
x<-1:8 #observations
freq<-c(21,25,43,51,40,39,12,10)
obv<-rep(x,freq)
am=sum(x*freq)/sum(freq) #arithmatic mean
am
## [1] 4.158
gm=prod(x^freq)^(1/sum(freq)) #geometric mean
gm
## [1] 3.675
hm=sum(freq)/sum(freq/x) #harmonic mean
hm
## [1] 3.088
mode<-x[which(freq==max(freq))] #obs. with max. frequency
mode
## [1] 4
obv<-sort(obv) #ordered sample
n<-length(obv)
median<-(obv[n/2]+obv[(n/2)+1])/2
median
## [1] 4
std.dev<-sqrt(sum((obv-mean(obv))^2)/n)
std.dev
## [1] 1.811
md1<-sum(abs(obv-10))/n #mean dev. about 10
md1
## [1] 5.842
md2<-sum(abs(obv-mean(obv)))/n #mean dev. about mean
md2
## [1] 1.478
range<-max(obv)-min(obv) #range
range
## [1] 7
quantile(obv)
## 0% 25% 50% 75% 100%
## 1 3 4 6 8
u<-unname(quantile(obv))
qrtl.dev<-u[4]-u[2] #quartile deviation
qrtl.dev
## [1] 3
m1<-sum(x*freq)/sum(freq) #1st order raw moment
m1
## [1] 4.158
m2<-sum(x^2*freq)/sum(freq)#2nd order raw moment
m2
## [1] 20.56
m3<-sum(x^3*freq)/sum(freq)#3rd order raw moment
m3
## [1] 113.3
m4<-sum(x^4*freq)/sum(freq)#4th order raw moment
m4
## [1] 673.3
M2<-(m2-m1^2) #2nd order central moment
M2
## [1] 3.278
M3<-m3-(3*m2*m1)+(2*m1^3) #3rd order central moment
M3
## [1] 0.5451
M4<-m4-(4*m3*m1)+(6*m2*m1^2)-(3*m1^4) #4th order central moment
M4
## [1] 25.47
g1<-M3/(std.dev^3) #1st measure of skewness
g1
## [1] 0.09184
b1<-g1^2 #2nd measure of skewness
b1
## [1] 0.008434
g2<-(M4/(std.dev^4))-3 #1st measure of kurtosis
g2
## [1] -0.6294
b2<-g2+3 #2nd measure of kurtosis
b2
## [1] 2.371
Comment: Comment: On the basis of the measures g1 & g2 we can say that this distribution is positively skewed and platykurtic.
Problem Set 2, (c)
height<-seq(147.05,182.05,5)
freq<-c(1,3,24,58,60,27,2,2)
a<-rep(height,freq)
hist(a,main="Histogram of freq. distn. of height
\nof 177 Indian adult males",ylim=c(0,70),xlab="Height(cm.)",
ylab="Frequency Density",col="darkseagreen2")
mean<-sum(height*freq)/sum(freq) #A.M
mean
## [1] 164.7
dfrm<-data.frame(height,freq)
dfrm<-transform(dfrm,cumfreq.l=cumsum(freq))
attach(dfrm)
## The following objects are masked _by_ .GlobalEnv:
##
## freq, height
me.c<-min(which(dfrm$cumfreq.l>=88.5))#'mc' for median class
me.l=164.55;me.u=169.55 #class boundaries of median class
f0=freq[me.c] #freq. of median class
cl.width=5 #class width
median<-me.l+(sum(freq)/2-cumfreq.l[me.c-1])*cl.width/f0 #Median
median
## [1] 164.8
mo.c<-which(freq==max(freq)) #modal class
mo.l<-height[mo.c]-2.5;mo.u<-height[mo.c]+2.5 #cls. boundaries of modal cls.
mode<-mo.l+
(((freq[mo.c]-freq[mo.c-1])*cl.width)/(2*freq[mo.c]-freq[mo.c-1]-freq[mo.c+1])) #Mode
mode
## [1] 164.8
Comment: Mean, median and mode are more or less equal, it implies that the frequency distribution of height of 177 adult Indian males is more or less symmetric. A histogram of this distribution gives the graphical proof.
Problem Set 2, (d)
word.length<-c(5,4,3,5,8,6,6,3,4,3,4,4,5,8,2,6,7,6,4,5,6,4,9,6,4,2,
2,2,9,2,3,3,3,2,4,7,7,2,4,4,4,3,4,4,2,4,4,9,3,7,4,5,12,6,3,5,2,5,10,
3,5,7,3,3,3,6,2,5,3,3,3,2,4,5,8,5,3,4,4,6,7,2,3,5,5,5,3,2,4,5)
fr.tab<-table(word.length) #frequency table
fr.tab
## word.length
## 2 3 4 5 6 7 8 9 10 12
## 13 19 20 15 9 6 3 3 1 1
freq<-unname(fr.tab) #extracts the frequencies only
freq
## [1] 13 19 20 15 9 6 3 3 1 1
wl<-as.numeric(names(fr.tab))
plot(wl,freq,type="h",lwd=10,main="Column Diagram for Word Length Data",
xlab="word length",ylab="Frequency")
plot(wl,freq,type="l",lwd=2,main="Frequency Polygon for Word Length Data",
xlab="Word length",ylab="Frequency")
grid(10,10,col="black",lty=1,lwd=1.25)
cum.fr.l<-cumsum(freq)
cum.fr.g<-rev(cumsum(rev(freq)))
plot(wl,cum.fr.l,type="s",main="Cumulative Frequency Diagram\nWord length Data",
xlab="Word length",ylab="Cum. freq",col="red")
lines(wl,cum.fr.g,type="s",col="blue",lty=1,lwd=1)
legend(7.5,45,legend=c("less than type","greater than type"),
col=c("red","blue"),lty=c(1,1),lwd=c(1,1))
Problem Set 2, (e)
height<-seq(147.05,182.05,5)
freq<-c(1,3,24,58,60,27,2,2)
b<-rep(height,freq)
hist(b,freq=T,main="Histogram of freq. distn. of height
\nof 177 Indian adult males",ylim=c(0,70),xlab="Height(cm.)",
ylab="Frequency Density",col="orange")
Comment: The frequency distribution of 177 Indian adult males seems to be more or less symmetric height measure 165 cm.
Problem Set 3,(d)
library(car)
res<-rchisq(50,df=5)
par(mfrow=c(1,2))
qqPlot(res,"norm",mean=0,sd=1,main="Normal Q-Q Plot",xlab="Theoretical Quantiles",
ylab="Sample Quantiles",col="dark green",envelope=F)
box()
qqPlot(res,"chisq",df=5,main="Chi-squared Q-Q Plot",xlab="Theoretical Quantiles",
ylab="Sample Quantiles",col="dark green",envelope=F)
box()
par(mfrow=c(1,1))
Problem Set 4,(a)
#Hypergeometric Distribution
N=50;p=0.08;m=10
(a1<-rhyper(50,N*p,N*(1-p),m))
## [1] 0 2 3 1 0 4 0 0 0 1 1 1 1 2 3 0 1 1 2 2 0 1 1 1 0 0 0 0 1 1 1 0 1 0 2
## [36] 2 2 0 0 1 0 1 0 1 0 0 1 0 1 0
hist(a1,prob=T,breaks=40,main="Hypergeometric Distribution")
#Binomial Distribution
(a2=rbinom(50,size=10,prob=0.08))
## [1] 1 1 0 1 0 0 0 1 1 1 1 1 3 1 0 0 2 0 1 2 1 0 1 0 1 2 2 1 1 0 1 0 1 0 1
## [36] 1 0 1 1 3 0 2 2 0 0 0 1 2 1 1
hist(a2,prob=T,breaks=40,main="Binomial Distribution")
#Poisson Distribution
(a3=rpois(50,lambda=5))
## [1] 5 6 8 0 7 11 2 6 1 5 4 1 5 3 2 4 1 6 13 3 4 4 4
## [24] 4 5 6 3 4 2 4 7 2 6 6 7 7 3 4 7 4 3 6 5 5 5 6
## [47] 6 2 5 3
hist(a3,prob=T,breaks=40,main="Poisson Distribution")
#Negative binomial Distribution
(a4=rnbinom(50,size=2,p=0.3))
## [1] 3 3 4 4 1 3 0 4 2 5 4 6 12 8 1 4 10 2 2 2 0 0 1
## [24] 0 4 4 5 4 1 6 2 1 4 3 4 12 7 4 4 4 2 3 2 2 0 5
## [47] 8 5 2 2
hist(a4,prob=T,breaks=40,main="Negative binomial distribution")
#Uniform Distribution
(a5=runif(50,min=-1,max=1))
## [1] 0.83248 0.43611 0.64405 0.12942 -0.83009 0.36332 -0.88719
## [8] -0.12689 -0.89908 0.23645 0.36525 -0.65092 -0.92070 0.31048
## [15] -0.11837 0.38925 0.61704 0.07788 0.28311 0.71714 0.48361
## [22] 0.39368 -0.15009 0.73115 0.55297 -0.76352 -0.41193 -0.28702
## [29] -0.07498 -0.19975 0.44213 0.51524 0.01614 -0.28080 0.07127
## [36] 0.10289 0.48865 -0.78649 -0.67313 0.57839 -0.46182 -0.53311
## [43] -0.61379 0.01316 0.21253 0.11320 0.03991 0.31441 -0.62265
## [50] -0.39472
hist(a5,prob=T,breaks=40,main="Uniform distribution")
#Exponential Distribution
(a6=rexp(50,rate=1))
## [1] 0.021322 2.818142 1.665370 1.127690 0.408367 3.226427 0.010526
## [8] 3.191992 1.392076 1.246930 0.084053 1.352184 1.077719 0.069680
## [15] 0.852086 0.215419 0.919785 1.619913 0.190371 0.316022 1.819239
## [22] 1.070554 1.150554 1.368658 0.049182 1.527954 2.501414 0.433714
## [29] 1.749039 3.569746 2.025098 0.682150 0.991434 0.482948 0.427279
## [36] 0.492884 0.838371 0.884066 0.684527 0.733355 0.497777 0.425395
## [43] 0.678024 0.407726 0.003668 0.529263 0.100722 0.165243 3.525454
## [50] 2.918235
hist(a6,prob=T,breaks=40,main="Exponential distribution")
#Gamma Distribution
(a7=rgamma(50,shape=2,scale=1/0.5))
## [1] 1.5194 8.7323 5.0112 5.3808 2.5689 3.7560 0.9024 5.1878
## [9] 8.8795 6.7652 0.8639 6.3527 7.8125 3.7566 5.7860 10.5738
## [17] 5.1313 3.2750 0.7868 5.2295 1.1398 7.9196 2.8757 4.4374
## [25] 0.2272 2.0854 1.7733 5.8107 3.2011 7.2478 9.2464 1.9531
## [33] 1.8189 4.5381 0.4502 3.8431 4.6504 6.3185 3.8842 3.1025
## [41] 9.7407 7.6229 1.4624 5.3390 4.3490 4.4336 11.1100 5.0727
## [49] 4.0605 0.9832
hist(a7,prob=T,breaks=40,main="Gamma distribution")
#Normal Distribution
(a8=rnorm(50,mean=2,sd=0.9))
## [1] 2.26133 1.62027 -0.25955 1.93170 2.20045 1.64113 2.28744
## [8] 2.98327 2.25620 0.52294 2.50472 2.34229 -0.43761 2.01171
## [15] 2.82413 2.04092 1.68098 2.44506 2.20083 3.61741 3.81555
## [22] 0.61219 2.37247 -0.42067 1.55787 1.34497 2.30083 1.32917
## [29] 1.44494 2.91677 2.75784 0.92687 1.19043 1.39576 3.05951
## [36] 2.09502 1.23137 0.41385 0.10059 1.50195 2.27822 2.87851
## [43] -0.04716 0.49346 2.35194 1.96581 1.51957 2.16626 1.56421
## [50] 2.31095
hist(a1,prob=T,breaks=40,main="Norma distribution, N(2,0.81)")
(rnorm(50,mean=2,sd=0.5))
## [1] 1.1892 2.8363 2.5936 2.3212 1.9049 2.5503 2.5789 2.4428 2.2551 2.4952
## [11] 1.0741 2.6539 1.9142 1.9648 0.9494 1.7294 1.9934 1.8712 1.8771 2.6616
## [21] 2.7913 2.4826 2.6862 2.0696 1.2455 2.0866 2.3095 2.5894 2.2323 2.2689
## [31] 2.4324 1.8221 1.9627 3.2848 2.6822 1.4466 2.2866 2.9505 1.9575 2.6219
## [41] 2.9105 1.9332 2.1261 1.7391 2.3877 1.7545 2.2162 2.0320 1.8253 1.5066
(rnorm(50,mean=2,sd=1.5))
## [1] 1.876242 6.037625 -0.451847 1.373633 0.532232 3.510680 3.340926
## [8] -0.007715 0.088956 4.453317 2.911617 0.921625 1.311006 -0.743663
## [15] 2.808198 2.514291 3.509133 2.187453 1.499554 1.807960 2.455559
## [22] 1.474674 1.485533 0.930782 2.142526 3.338360 1.575877 1.907328
## [29] 2.005079 2.448050 2.132124 1.944677 3.466303 -0.168645 3.908298
## [36] 0.821826 2.516435 5.170779 2.320600 3.440130 1.425027 2.446219
## [43] 1.769183 2.097993 1.625106 2.596603 0.977233 -0.519476 1.495581
## [50] -0.776313
#Log-Normal Distribution
(a9=rlnorm(50,meanlog=0,sdlog=1))
## [1] 0.3005 1.7389 13.1347 4.2291 0.6998 1.8213 0.2785 2.1887
## [9] 1.4761 0.6681 0.8359 1.4582 0.3899 3.0194 1.4543 1.4540
## [17] 0.4870 3.6922 1.5046 0.7333 0.4288 1.6527 0.6560 0.8621
## [25] 0.4482 0.3156 1.4359 1.5796 5.8439 0.8201 4.7405 1.3964
## [33] 2.5449 4.5905 1.6255 2.0441 0.8883 0.2330 1.0264 0.7197
## [41] 0.4117 0.9750 0.5959 2.5151 0.5787 1.2945 0.3047 1.4198
## [49] 1.4584 0.5964
hist(a9,prob=T,breaks=40,main="Log-Normal distribution")
#Beta Distribution
(a10=rbeta(50,shape1=1.5,shape2=1.4))
## [1] 0.66212 0.61942 0.20544 0.95167 0.91171 0.95260 0.40890 0.54824
## [9] 0.29276 0.55729 0.81590 0.56886 0.56937 0.03990 0.26728 0.46295
## [17] 0.33556 0.06203 0.06079 0.41014 0.48468 0.47872 0.24377 0.76013
## [25] 0.56997 0.27136 0.29022 0.51441 0.44391 0.87780 0.85589 0.09426
## [33] 0.54265 0.15311 0.46802 0.70016 0.17142 0.86240 0.23439 0.54585
## [41] 0.58151 0.69536 0.46311 0.64763 0.05720 0.85511 0.55383 0.55512
## [49] 0.19872 0.68647
hist(a10,prob=T,breaks=40,main="Beta distribution, Beta(1.5,1.4)")
(rbeta(50,shape1=1.1,shape2=1.4))
## [1] 0.32144 0.96026 0.42660 0.36804 0.62129 0.82827 0.94386 0.62923
## [9] 0.30584 0.92903 0.55857 0.96414 0.31448 0.14435 0.84991 0.69109
## [17] 0.20793 0.49479 0.51844 0.41784 0.24067 0.20343 0.52068 0.38265
## [25] 0.51435 0.32633 0.16032 0.64395 0.42573 0.22513 0.13632 0.04522
## [33] 0.71912 0.23443 0.51307 0.44959 0.54495 0.18560 0.17618 0.21729
## [41] 0.33008 0.28586 0.66376 0.16088 0.39746 0.92421 0.14569 0.85733
## [49] 0.06345 0.73436
(rbeta(50,shape1=1.5,shape2=1.1))
## [1] 0.22491 0.97405 0.57967 0.47966 0.42494 0.20674 0.88001 0.28277
## [9] 0.70045 0.62073 0.79499 0.03310 0.53514 0.63557 0.91030 0.21056
## [17] 0.05013 0.30046 0.79456 0.49406 0.86579 0.98466 0.43654 0.14420
## [25] 0.82091 0.87598 0.33492 0.90129 0.06676 0.90037 0.78858 0.45385
## [33] 0.15637 0.62456 0.45036 0.79109 0.47455 0.77335 0.76393 0.62468
## [41] 0.20989 0.96819 0.32499 0.99826 0.79783 0.44038 0.40867 0.63830
## [49] 0.64034 0.56435
No comments:
Post a Comment