#######################
#Data used in analysis#      
#######################
#data are downloaded from the "National Archive of Criminal Justice Data" website (http://www.icpsr.umich.edu/NACJD/) and then cleaned as needed
#obs is the matrix of observations, 3*5 matrix

source("data.txt")

#####################
#R code for analysis#
#####################

#classic model: to get the prior information for the Bayesian model

classic_model<-function(obs)
{#a stores the complete data (y) matrix
 a<-matrix(0,17,2) 
 
 #store the observed data from obs (x) to y matrix
 a[1,1]<-obs[1,1] 
 a[1,2]<-obs[1,2]
 a[3,1]<-obs[2,1]
 a[3,2]<-obs[2,2]
 a[5,1]<-obs[3,1]
 a[5,2]<-obs[3,2]
 a[6,1]<-obs[4,1]
 a[6,2]<-obs[4,2]
 a[7,2]<-obs[5,2]
 
 #use the mean to filling in the missing data
 for (i in c(2,4,7))
 a[i,1]<-obs[5,1]/3

 for (i in c(8,11,14,16))
 {j<-(i-8)/3+1  
  a[i,1]<-obs[j,3]/2
  a[i,2]<-obs[j,3]/2
  }
  
 tmp<-obs[5,3]/10
 a[9,1]<-tmp
 a[12,1]<-tmp

 for(i in c(10,13,15,17))
 {a[i,1]<-tmp
  a[i,2]<-tmp
 }


 #iteration

 n<-0                #iteration times
 swit<-1             #control the iteration
 opo<-0              #store the previously estimated probability
 oto<-0
 oro<-0
 odo<-0
 opr<-0
 opd<-0
 ope<-0
 opp<-0
 opn<-0
 ops<-0
 opsb<-0             #initial values for probabilities

while(swit==1 && n<=1000)     
{ 
 #calculate probabilities
 npsum<-sum(a[1:7,])   #number of phone interview
 psum<-sum(a[8:17,])   #number of personal interview
 po<-psum/(npsum+psum) #p hat
 
 rape<-a[1,1]+a[8,1]+a[10,1] #number of rapes that should have been reported when spouse is present
 nrape<-a[2,1]+a[9,1]        #number of rapes not reported because of spouse present
 ro<-rape/(rape+nrape)       #rho hat

 dom<-a[3,1]+a[11,1]+a[13,1] #number of domestic violence that should have been reported when spous is present
 ndom<-a[4,1]+a[12,1]        #number of domestic violences not reported because of spouse present
 doo<-dom/(dom+ndom)         #delta hat

 c<-apply(a,1,sum)
 to<-(c[8]+c[11]+c[14])/(c[8]+c[11]+c[14]+c[10]+c[13]+c[15])   #t hat

 s<-rep(0,2)                 #Assume the independence of w matrix, store the column probability
 p<-rep(0,5)                 #store the row probability
 w<-matrix(0,5,2)
 tot<-sum(a)
 s[1]<-sum(a[,1])/tot        #ps hat: probability that spouse is present during the interview
 s[2]<-sum(a[,2])/tot        #psb hat: probability that spouse is not present during the interview
 p[1]<-(c[1]+c[2]+c[8]+c[9]+c[10])/tot    #probability of rape
 p[2]<-(c[3]+c[4]+c[11]+c[12]+c[13])/tot  #probability of domestic violence
 p[3]<-(c[5]+c[14]+c[15])/tot             #probability of other assault
 p[4]<-(c[6]+c[16])/tot                   #probability of personal larceny
 p[5]<-(c[7]+c[17])/tot                   #probability of no crime reported
 for (i in 1:5)               #use the independence assumption for the cell probabilities
 for (j in 1:2)
 {w[i,j]<-p[i]*s[j]}          #w hat(i,j)

 #check convergence
 b<-sum(abs(opo-po),abs(oro-ro),abs(odo-doo),abs(oto-to),abs(opr-p[1]),abs(opd-p[2]),abs(ope-p[3]),abs(opp-p[4]),abs(opn-p[5]),abs(ops-s[1]))
 if (b<=0.0001)               #diagonose as converged when the sum of absolute difference is smaller than 0.0001
 swit<-0

 #store the estimated probabilities of this iteration for next iteration
 opo<-po
 oro<-ro
 odo<-doo
 oto<-to
 opr<-p[1]
 opd<-p[2]
 ope<-p[3]
 opp<-p[4]
 opn<-p[5]
 ops<-s[1]
 
 #obtain the updated complete data counts: E-step
 w1<-(1-po)*(1-ro)*w[1,1]
 w2<-(1-po)*(1-doo)*w[2,1]
 w3<-(1-po)*w[5,1]
 tmp<-obs[5,1]/(w1+w2+w3)
 a[2,1]<-tmp*w1
 a[4,1]<-tmp*w2
 a[7,1]<-tmp*w3

 w1<-po*to*ro*w[1,1]
 w2<-po*to*w[1,2]
 tmp<-obs[1,3]/(w1+w2)
 a[8,1]<-w1*tmp
 a[8,2]<-w2*tmp

 w1<-po*to*doo*w[2,1]
 w2<-po*to*w[2,2]
 tmp<-obs[2,3]/(w1+w2)
 a[11,1]<-w1*tmp
 a[11,2]<-w2*tmp

 w1<-po*to*w[3,1]
 w2<-po*to*w[3,2]
 tmp<-obs[3,3]/(w1+w2)
 a[14,1]<-w1*tmp
 a[14,2]<-w2*tmp

 w1<-po*w[4,1]
 w2<-po*w[4,2]
 tmp<-obs[4,3]/(w1+w2)
 a[16,1]<-w1*tmp
 a[16,2]<-w2*tmp

 w1<-po*(1-ro)*w[1,1]
 w2<-po*(1-to)*ro*w[1,1]
 w3<-po*(1-to)*w[1,2]
 w4<-po*(1-doo)*w[2,1]
 w5<-po*doo*(1-to)*w[2,1]
 w6<-po*(1-to)*w[2,2]
 w7<-po*(1-to)*w[3,1]
 w8<-po*(1-to)*w[3,2]
 w9<-po*w[5,1]
 w10<-po*w[5,2]
 tmp<-obs[5,3]/(w1+w2+w3+w4+w5+w6+w7+w8+w9+w10)
 a[9,1]<-tmp*w1
 a[10,1]<-tmp*w2
 a[10,2]<-tmp*w3
 a[12,1]<-tmp*w4
 a[13,1]<-tmp*w5
 a[13,2]<-tmp*w6
 a[15,1]<-tmp*w7
 a[15,2]<-tmp*w8
 a[17,1]<-tmp*w9
 a[17,2]<-tmp*w10

 n<-n+1 #count the iteration number
}

result<-list(n=n,w=w,rate=apply(w,1,sum)*1000,po=po,to=to,ro=ro,doo=doo,y=a,s=s,p=p)
result
}

#Example
classic_model(obs1)                    #93-97 unweighted data
classic_model(obs2)                    #93-97 weighted data

#####################################################################
#emperical bayesian model

bayesian_model<-function(obs,pro,pdoo,pto)     
#pro,pdoo and pto are prior information: the estimated rates from the old data
{#a stores the complete data (y) matrix 
 a<-matrix(0,17,2) 
 
 #store the observed data from obs (x) to y matrix
 a[1,1]<-obs[1,1] 
 a[1,2]<-obs[1,2]
 a[3,1]<-obs[2,1]
 a[3,2]<-obs[2,2]
 a[5,1]<-obs[3,1]
 a[5,2]<-obs[3,2]
 a[6,1]<-obs[4,1]
 a[6,2]<-obs[4,2]
 a[7,2]<-obs[5,2]
 
 #use the mean to filling in the missing data
 for (i in c(2,4,7))
 a[i,1]<-obs[5,1]/3

 for (i in c(8,11,14,16))
 {j<-(i-8)/3+1  
  a[i,1]<-obs[j,3]/2
  a[i,2]<-obs[j,3]/2
  }
  
 tmp<-obs[5,3]/10
 a[9,1]<-tmp
 a[12,1]<-tmp

 for(i in c(10,13,15,17))
 {a[i,1]<-tmp
  a[i,2]<-tmp
 }


 #iteration

 n<-0                #iteration times
 swit<-1             #control the iteration
 opo<-0              #store the previously estimated probability
 oto<-0
 oro<-0
 odo<-0
 opr<-0
 opd<-0
 ope<-0
 opp<-0
 opn<-0
 ops<-0
 opsb<-0             #initial values for probabilities

while(swit==1 && n<=1000)
{#calculate probabilities
 npsum<-sum(a[1:7,])   #number of phone interview
 psum<-sum(a[8:17,])   #number of personal interview
 po<-psum/(npsum+psum) #p hat, use the classic method
 
 rape<-a[1,1]+a[8,1]+a[10,1]      #number of rapes that should have been reported when spous is present
 nrape<-a[2,1]+a[9,1]             #number of rapes not reported because of spouse present
 ro<-pseudo(rape,nrape,pro)       #r hat unweighted: 0.14387, use the pseudo Bayesian method

 dom<-a[3,1]+a[11,1]+a[13,1]      #number of domestic violence that should have been reported when spous is present
 ndom<-a[4,1]+a[12,1]             #number of domestic violences not reported because of spouse present
 doo<-pseudo(dom,ndom,pdoo)       #d hat unweighted: 0.06976028, use the pseudo Bayesian method

 c<-apply(a,1,sum)
 tel<-c[8]+c[11]+c[14]
 ntel<-c[10]+c[13]+c[15]          #number of crimes not reported because of telephone interview
 to<-pseudo(tel,ntel,pto)         #t hat unweighted: 0.5277342, use the pseudo Bayesian method

 s<-rep(0,2)                 #Assume the independence of w matrix, store the column probability
 p<-rep(0,5)                 #store the row probability
 w<-matrix(0,5,2)
 tot<-sum(a)
 s[1]<-sum(a[,1])/tot        #ps hat: probability that spouse is present during the interview, classic method
 s[2]<-sum(a[,2])/tot        #psb hat: probability that spouse is not present during the interview
 p[1]<-(c[1]+c[2]+c[8]+c[9]+c[10])/tot    #probability of rape, classic method
 p[2]<-(c[3]+c[4]+c[11]+c[12]+c[13])/tot  #probability of domestic violence
 p[3]<-(c[5]+c[14]+c[15])/tot             #probability of other assault
 p[4]<-(c[6]+c[16])/tot                   #probability of personal larceny
 p[5]<-(c[7]+c[17])/tot                   #probability of no crime reported
 for (i in 1:5)               #use the independence assumption for the cell probabilities
 for (j in 1:2)
 {w[i,j]<-p[i]*s[j]}          #w hat(i,j)

 #check convergence
 b<-sum(abs(opo-po),abs(oro-ro),abs(odo-doo),abs(oto-to),abs(opr-p[1]),abs(opd-p[2]),abs(ope-p[3]),abs(opp-p[4]),abs(opn-p[5]),abs(ops-s[1]))
 if (b<=0.0001)                #diagonose as converged when the sum of absolute difference is smaller than 0.0001, store the risk when convergence met.
 {swit<-0
  kro<-prisk(rape,nrape,pro)
  kdo<-prisk(dom,ndom,pdoo)
  kto<-prisk(tel,ntel,pto)
 }

 #store the estimated probabilities of this iteration for next iteration
 opo<-po
 oro<-ro
 odo<-doo
 oto<-to
 opr<-p[1]
 opd<-p[2]
 ope<-p[3]
 opp<-p[4]
 opn<-p[5]
 ops<-s[1]
 
 #obtain the updated complete data counts: E-step
 w1<-(1-po)*(1-ro)*w[1,1]
 w2<-(1-po)*(1-doo)*w[2,1]
 w3<-(1-po)*w[5,1]
 tmp<-obs[5,1]/(w1+w2+w3)
 a[2,1]<-tmp*w1
 a[4,1]<-tmp*w2
 a[7,1]<-tmp*w3

 w1<-po*to*ro*w[1,1]
 w2<-po*to*w[1,2]
 tmp<-obs[1,3]/(w1+w2)
 a[8,1]<-w1*tmp
 a[8,2]<-w2*tmp

 w1<-po*to*doo*w[2,1]
 w2<-po*to*w[2,2]
 tmp<-obs[2,3]/(w1+w2)
 a[11,1]<-w1*tmp
 a[11,2]<-w2*tmp

 w1<-po*to*w[3,1]
 w2<-po*to*w[3,2]
 tmp<-obs[3,3]/(w1+w2)
 a[14,1]<-w1*tmp
 a[14,2]<-w2*tmp

 w1<-po*w[4,1]
 w2<-po*w[4,2]
 tmp<-obs[4,3]/(w1+w2)
 a[16,1]<-w1*tmp
 a[16,2]<-w2*tmp

 w1<-po*(1-ro)*w[1,1]
 w2<-po*(1-to)*ro*w[1,1]
 w3<-po*(1-to)*w[1,2]
 w4<-po*(1-doo)*w[2,1]
 w5<-po*doo*(1-to)*w[2,1]
 w6<-po*(1-to)*w[2,2]
 w7<-po*(1-to)*w[3,1]
 w8<-po*(1-to)*w[3,2]
 w9<-po*w[5,1]
 w10<-po*w[5,2]
 tmp<-obs[5,3]/(w1+w2+w3+w4+w5+w6+w7+w8+w9+w10)
 a[9,1]<-tmp*w1
 a[10,1]<-tmp*w2
 a[10,2]<-tmp*w3
 a[12,1]<-tmp*w4
 a[13,1]<-tmp*w5
 a[13,2]<-tmp*w6
 a[15,1]<-tmp*w7
 a[15,2]<-tmp*w8
 a[17,1]<-tmp*w9
 a[17,2]<-tmp*w10

 n<-n+1 #count the iteration number
}

result<-list(n=n,w=w,rate=apply(w,1,sum)*1000,po=po,to=to,ro=ro,doo=doo,y=a,s=s,p=p,kro=kro,kdo=kdo,kto=kto)
result
}

###########################emperical-baysian estimator for binomial dist

pseudo<-function(x1,x2,p)
{N<-x1+x2
 k<-2*x1*x2/(x1^2+x2^2-2*N*(x1*p+x2-x2*p)+N^2*(p^2+(1-p)^2))
 phat<-N/(N+k)*(x1/N)+k/(N+k)*p
 phat
}

pseudo4<-function(x1,x2,x3,x4,p1,p2,p3,p4)
{N<-x1+x2+x3+x4
 k<-(N^2-x1^2-x2^2-x3^2-x4^2)/(x1^2+x2^2+x3^2+x4^2-2*N*(x1*p1+x2*p2+x3*p3+x4*p4)+N^2*(p1^2+p2^2+p3^2+p4^2))
 ph1<-N/(N+k)*(x1/N)+k/(N+k)*p1
 ph2<-N/(N+k)*(x2/N)+k/(N+k)*p2
 ph3<-N/(N+k)*(x3/N)+k/(N+k)*p3
 ph4<-1-ph1-ph2-ph3
 c(ph1,ph2,ph3,ph4)
}

prisk<-function(x1,x2,p)
{N<-x1+x2
 k<-2*x1*x2/(x1^2+x2^2-2*N*(x1*p+x2-x2*p)+N^2*(p^2+(1-p)^2))
 risk<-2*x1*x2/(N+k)^2+k^2/(N*(N+k)^2)*(x1^2+x2^2-2*N*(x1*p+x2-x2*p)+N^2*(p^2+(1-p)^2))
 c(risk,k,k/(N+k))
}

prisk4<-function(x1,x2,x3,x4,p1,p2,p3,p4)
{N<-x1+x2+x3+x4
 k<-(N^2-x1^2-x2^2-x3^2-x4^2)/(x1^2+x2^2+x3^2+x4^2-2*N*(x1*p1+x2*p2+x3*p3+x4*p4)+N^2*(p1^2+p2^2+p3^2+p4^2))
 risk<-(N^2-x1^2-x2^2-x3^2-x4^2)/(N+k)^2+k^2/(N*(N+k)^2)*(x1^2+x2^2+x3^2+x4^2-2*N*(x1*p1+x2*p2+x3*p3+x4*p4)+N^2*(p1^2+p2^2+p3^2+p4^2))
 c(risk,k,k/(N+k))
}

#example

bayesian_model(obs3,0.14387,0.06976028,0.5277342)    #the prior information is from unweighted data 1993-1997
bayesian_model(obs4,0.1399297,0.06491641,0.5347465)  #the prior information is from weighted data 1993-1997


################################################################################
#Jackknife method to calculate the variance of estimation    

npsu <-  28                        #number of psu

class_j_n<-rep(0,npsu)            #store the results of class_jic model
class_j_rate<-matrix(0,npsu,5)
class_j_po<-rep(0,npsu)
class_j_to<-rep(0,npsu)
class_j_ro<-rep(0,npsu)
class_j_doo<-rep(0,npsu)
class_j_s<-matrix(0,npsu,2)
class_j_p<-matrix(0,npsu,5)

bayes_j_n<-rep(0,npsu)            #store the results of Bayesian model
bayes_j_rate<-matrix(0,npsu,5)
bayes_j_po<-rep(0,npsu)
bayes_j_to<-rep(0,npsu)
bayes_j_ro<-rep(0,npsu)
bayes_j_doo<-rep(0,npsu)
bayes_j_s<-matrix(0,npsu,2)
bayes_j_p<-matrix(0,npsu,5)
bayes_j_kro<-matrix(0,npsu,3)
bayes_j_kdo<-matrix(0,npsu,3)
bayes_j_kto<-matrix(0,npsu,3)

w_class_j_n<-rep(0,npsu)            #store the results of class_jic model for weighted data
w_class_j_rate<-matrix(0,npsu,5)
w_class_j_po<-rep(0,npsu)
w_class_j_to<-rep(0,npsu)
w_class_j_ro<-rep(0,npsu)
w_class_j_doo<-rep(0,npsu)
w_class_j_s<-matrix(0,npsu,2)
w_class_j_p<-matrix(0,npsu,5)

w_bayes_j_n<-rep(0,npsu)            #store the results of bayes_jian model for weighted data
w_bayes_j_rate<-matrix(0,npsu,5)
w_bayes_j_po<-rep(0,npsu)
w_bayes_j_to<-rep(0,npsu)
w_bayes_j_ro<-rep(0,npsu)
w_bayes_j_doo<-rep(0,npsu)
w_bayes_j_s<-matrix(0,npsu,2)
w_bayes_j_p<-matrix(0,npsu,5)
w_bayes_j_kro<-matrix(0,npsu,3)
w_bayes_j_kdo<-matrix(0,npsu,3)
w_bayes_j_kto<-matrix(0,npsu,3)

temp<-1:28

for (u in 1:npsu)
{boots<-temp[-u]   # get the Jackknife data
 sum_temp<-matrix(0,7,6)
 sum_temp_w<-matrix(0,7,6)
 for (i in boots)
 {sum_temp<-sum_temp+ncvs_9804_unweighted[,,i]
  sum_temp_w<-sum_temp_w+ncvs_9804_weighted[,,i]
  }
 group_sum_temp_w<-matrix(0,4,6)
 group_sum_temp_w[1,]<-sum_temp_w[1,]
 group_sum_temp_w[2,]<-sum_temp_w[2,]+sum_temp_w[3,]
 group_sum_temp_w[3,]<-sum_temp_w[4,]+sum_temp_w[5,]
 group_sum_temp_w[4,]<-sum_temp_w[7,]
 dimnames(group_sum_temp_w)<-list(c("telephone interview", "spouse present", "spouse not present",  "all interviews"), c("#Interviews", "Rape", "Dom_Vio", "Other_Ass", "Per_Lar", "No"))
 group_sum_temp<-matrix(0,4,6)
 group_sum_temp[1,]<-sum_temp[1,]
 group_sum_temp[2,]<-sum_temp[2,]+sum_temp[3,]
 group_sum_temp[3,]<-sum_temp[4,]+sum_temp[5,]
 group_sum_temp[4,]<-sum_temp[7,]
 dimnames(group_sum_temp)<-list(c("telephone interview", "spouse present", "spouse not present",  "all interviews"), c("#Interviews", "Rape", "Dom_Vio", "Other_Ass", "Per_Lar", "No"))
 
 obs<-t(group_sum_temp)[2:6,c(2,3,1)] #work on the unweighted data
 
 result1<-classic_model(obs)
 class_j_n[u]<-result1$n            #store the results of classic model
 class_j_rate[u,]<-result1$rate
 class_j_po[u]<-result1$po
 class_j_to[u]<-result1$to
 class_j_ro[u]<-result1$ro
 class_j_doo[u]<-result1$doo
 class_j_s[u,]<-result1$s
 class_j_p[u,]<-result1$p
 
 result2<- bayesian_model(obs,0.14387,0.06976028,0.5277342)
 bayes_j_n[u]<-result2$n          #store the results of Bayesian model
 bayes_j_rate[u,]<-result2$rate
 bayes_j_po[u]<-result2$po
 bayes_j_to[u]<-result2$to
 bayes_j_ro[u]<-result2$ro
 bayes_j_doo[u]<-result2$doo
 bayes_j_s[u,]<-result2$s
 bayes_j_p[u,]<-result2$p
 bayes_j_kro[u,]<-result2$kro
 bayes_j_kdo[u,]<-result2$kdo
 bayes_j_kto[u,]<-result2$kto

 obs<-t(group_sum_temp_w)[2:6,c(2,3,1)] #work on the weighted data
 
 result1<-classic_model(obs)
 w_class_j_n[u]<-result1$n            #store the results of classic model
 w_class_j_rate[u,]<-result1$rate
 w_class_j_po[u]<-result1$po
 w_class_j_to[u]<-result1$to
 w_class_j_ro[u]<-result1$ro
 w_class_j_doo[u]<-result1$doo
 w_class_j_s[u,]<-result1$s
 w_class_j_p[u,]<-result1$p
 
 result2<- bayesian_model(obs,0.1407742,0.06924947,0.5345764)
 w_bayes_j_n[u]<-result2$n           #store the results of Bayesian model
 w_bayes_j_rate[u,]<-result2$rate
 w_bayes_j_po[u]<-result2$po
 w_bayes_j_to[u]<-result2$to
 w_bayes_j_ro[u]<-result2$ro
 w_bayes_j_doo[u]<-result2$doo
 w_bayes_j_s[u,]<-result2$s
 w_bayes_j_p[u,]<-result2$p
 w_bayes_j_kro[u,]<-result2$kro
 w_bayes_j_kdo[u,]<-result2$kdo
 w_bayes_j_kto[u,]<-result2$kto
}