Install the R-package from CRAN using the
install.packages
function:
Load the R package using the library()
function in R as
below:
The functions available in the package can be readily used on real data. But in absence of that, we shall first create a synthetic dataset and demonstrate the usage of the available functions on the said dataset here.
Say, we want to apply our method to a presence-absence of disease
data in the counties of Iowa. First, we need to create the adjacency
matrix for the counties in Iowa. To do so, we shall use the datasets
county.adjacency
and USAcities
provided in the
package as below:
data(county.adjacency)
data(USAcities)
library(dplyr)
library(gt)
library(gtsummary)
IAcities <- USAcities %>% filter(state_id=="IA")
countyname <- unique(IAcities$county_name)
A <- get_adj_mat(county.adjacency,countyname,c("IA"))
m <- apply(A,1,sum)
Next, we will set some parameters up:
set.seed(091017)
n <- nrow(A) # Number of spatial units
nis <- rep(16,n) # Number of individuals per county; here it's balanced -- 12 per county per year
# Note: may need to lower proposal variance, s, below as n_i increases
sid <- rep(1:n,nis)
tid <- rep(1:nis[1],n)
N <- length(sid) # Total number of observations
Next, we generate the spatial effects:
library(spam)
# library(mvtnorm)
kappa <- 0.99 # Spatial dependence parameter ~ 1 for intrinsic CAR
covm <- matrix(c(.5,.10,.10,-.10,
.10,.15,.10,.10,
.10,.10,.5,.10,
-.10,.10,.10,.15),4,4)# Conditional Cov of phi1 and phi2 given phi_{-i}
Q <- as.spam(diag(m)-kappa*A)
covphi <- solve(Q)%x%covm # 3n x 3n covariance of phis
phi <- rmvnorm(1,sigma=covphi) # 3n vector of spatial effects
phitmp <- matrix(phi,ncol=4,byrow=T) # n x 3 matrix of spatial effects
true.phi1 <- phi1 <- phitmp[,1]-mean(phitmp[,1]) # n x 1 phi1 vector -- Centered
true.phi2 <- phi2 <- phitmp[,2]-mean(phitmp[,2]) # n x 1 phi2 vector, etc.
true.phi3 <- phi3 <- phitmp[,3]-mean(phitmp[,3])
true.phi4 <- phi4 <- phitmp[,4]-mean(phitmp[,4])
Phi1 <- rep(phi1,nis)
Phi2 <- rep(phi2,nis)
Phi3 <- rep(phi3,nis)
Phi4 <- rep(phi4,nis)
true.phi <- cbind(true.phi1,true.phi2,true.phi3,true.phi4)
Followed by the fixed effects and the covariates X
:
library(boot)
tpop <- USAcities %>% filter(state_id=="IA") %>% group_by(county_fips) %>% summarise(tpop=population) %>% .[,2] %>% unlist %>% as.numeric
x <- rep(log(tpop),nis)
x <- (x-mean(x))/sd(x)
# set.seed(2023); x = rnorm(N)
t <- tid / max(tid)
X <- cbind(1,x) # Design matrix, can add additional covariates (e.g., race, age, gender)
Xtilde <- cbind(1,x,sin(t))
p <- ncol(Xtilde)
Next, we set up the binomial and the count parts of the data:
# Binomial part
true.alpha <- rep(0.1,p)
true.alpha[1:2] <- c(-0.25,0.25)
# true.alpha <- c(-0.25,0.25,-0.5,0.25)
eta1 <- Xtilde%*%true.alpha+Phi1+Phi2*t
pii <- inv.logit(eta1) # 1-pr("structural zero")
u <- rbinom(N,1,pii) # at-risk indicator
N1 <- sum(u)
pstruct0 <- 1-mean(u) # Proportion of structural zeros
# Count Part
true.beta <- rep(0.1,p)
true.beta[1:2] <- c(.5,-.25)
tmp <- u
tmp[u==0] <- 0 # If in structural group then set tmp to 0
nis1 <- tapply(tmp,sid,sum) # Number of at risk observations in each county
eta2 <- Xtilde[u==1,]%*%true.beta+Phi3[u==1]+Phi4[u==1]*t[u==1] # Linear predictor for count part
true.r <- 1.25 # NB dispersion
psi <- exp(eta2)/(1+exp(eta2)) # Prob of success
mu <- true.r*psi/(1-psi) # NB mean
Finally, we generate the data y
and make it into a data
frame.
y <- rep(0,N) # Response
y[u==1] <- rnbinom(N1,size=true.r,mu=mu)# If at risk, draw from NB
pzero <- length(y[y==0])/N # Proportion of zeros
# pzero
# pstruct0/pzero
# table(y)
# mean(mu)
# quantile(tapply(u,id,sum)) # Number of at-risk observations per county *for all 5 years*
# quantile(y)
this_dat <- data.frame(sid,tid,y,X)
head(this_dat)
#> sid tid y V1 x
#> 1 1 1 1 1 -0.9924003
#> 2 1 2 0 1 -0.9924003
#> 3 1 3 0 1 -0.9924003
#> 4 1 4 3 1 -0.9924003
#> 5 1 5 0 1 -0.9924003
#> 6 1 6 0 1 -0.9924003
tbl_summary(this_dat) %>% modify_header(label = "**Coefficients**")
Coefficients | N = 1,5841 |
---|---|
sid | 50 (25, 75) |
tid | 9 (5, 13) |
y | 0.00 (0.00, 1.00) |
V1 | |
1 | 1,584 (100%) |
x | -0.20 (-0.68, 0.38) |
1 Median (Q1, Q3); n (%) |
Once we have generated the data, we shall now use the functions available in our package on this data.
First, we’ll focus on summarizing and visualizing the data by producing spatial maps and other plots and summaries of the data. The following code generates a spatial map of cumulative deaths in the state of Iowa:
We can additionally make histograms and spaghetti plots and Moran’s I:
# blue histogram
tmp <- table(y)/N*100 # convert to %s (divide by N multiply by 100)
oldpar <- par(no.readonly = TRUE)
par(mar=c(3,3,1,1))
barplot(tmp, ylab="Percent",xlab="Count",col="lightblue")
# sphagetti plot
library(CorrMixed)
# Plot individual profiles + mean
Spaghetti.Plot(Dataset = this_dat, Outcome = y, Id=sid, Time = tid)
par(oldpar)
# Moran's I
library(ape)
library(reshape)
this_dat2 <- cast(this_dat,sid~tid,sum,value="y")
Moran.I(rowMeans(this_dat2),A)
#> $observed
#> [1] 0.2704696
#>
#> $expected
#> [1] -0.01020408
#>
#> $sd
#> [1] 0.05190279
#>
#> $p.value
#> [1] 6.384654e-08
this_dat2 %>% apply(2,function(w) Moran.I(w,A)) %>% lapply(function(list) list$p.value) %>% unlist
#> 1 2 3 4 5 6
#> 3.490479e-02 3.369633e-02 5.166453e-02 4.666249e-06 5.565006e-02 5.294755e-08
#> 7 8 9 10 11 12
#> 3.387397e-02 4.195429e-02 4.094047e-05 5.561652e-02 9.834662e-04 5.172957e-05
#> 13 14 15 16
#> 1.770772e-03 1.867974e-01 2.719572e-04 3.982139e-05
Moran.I(rowMeans(this_dat2),A)
#> $observed
#> [1] 0.2704696
#>
#> $expected
#> [1] -0.01020408
#>
#> $sd
#> [1] 0.05190279
#>
#> $p.value
#> [1] 6.384654e-08
We can visualize the death count at given times by using the
tsel
option:
USDmapCount(state.sel = c("IA"),
dat = this_dat,
scol = 1,
tcol = 2, tsel = 3,
cname = countyname) ## Timepoint 3
USDmapCount(state.sel = c("IA"),
dat = this_dat,
scol = 1,
tcol = 2, tsel = 6,
cname = countyname) ## Timepoint 6
USDmapCount(state.sel = c("IA"),
dat = this_dat,
scol = 1,
tcol = 2, tsel = 9,
cname = countyname) ## Timepoint 9
USDmapCount(state.sel = c("IA"),
dat = this_dat,
scol = 1,
tcol = 2, tsel = 12,
cname = countyname) ## Timepoint 12
Now, we shall apply the different models on this data. We’ll start by
fitting the Bayesian Spatiotemporal Poisson (BSTP) model using the
functions BNB
and BZINB
:
res0 <- BNB(y, X, A, nchain=2, niter=50, nburn=10)
glimpse(res0)
#> List of 4
#> $ Alpha: NULL
#> $ Beta : num [1:40, 1:2, 1:2] 0.181 0.158 0.179 0.202 0.194 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:2] "" "x"
#> .. ..$ : NULL
#> $ R : num [1:40, 1:2] 0.908 0.898 0.911 0.907 0.907 ...
#> $ Eta1 : num [1:40, 1:1584, 1:2] 0.196 0.282 0.259 0.26 0.237 ...
res1 <- BZINB(y, X, A, nchain=2, niter=50, nburn=10)
glimpse(res1)
#> List of 6
#> $ Alpha: num [1:40, 1:2, 1:2] -0.31 -0.374 -0.254 -0.172 -0.307 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:2] "" "x"
#> .. ..$ : NULL
#> $ Beta : num [1:40, 1:2, 1:2] 0.983 0.993 1.018 0.868 0.899 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:2] "" "x"
#> .. ..$ : NULL
#> $ R : num [1:40, 1:2] 0.964 0.961 0.96 0.978 0.969 ...
#> $ Eta1 : num [1:40, 1:1584, 1:2] -0.635 -0.573 -0.299 -0.235 -0.468 ...
#> $ Eta2 : num [1:40, 1:1584, 1:2] 1.213 1.224 1.161 1.187 0.983 ...
#> $ I : num [1:40, 1:1584, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
apply(res0$Beta,2,mean)
#> x
#> 0.27111982 -0.08627159
apply(res1$Beta,2,mean)
#> x
#> 0.9038659 -0.2531115
Next will be the Bayesian Spatiotemporal Negative Binomial (BSTNB)
model using the function BSTNB
:
res2 <- BSTNB(y, X, A, nchain=2, niter=50, nburn=10)
glimpse(res2)
#> List of 7
#> $ Alpha : NULL
#> $ Beta : num [1:40, 1:2, 1:2] -0.01044 0.00529 -0.03833 -0.09124 -0.05431 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:2] "" "x"
#> .. ..$ : NULL
#> $ R : num [1:40, 1:2] 0.978 0.955 0.94 0.94 0.926 ...
#> $ Sigphi: num [1:40, 1:4, 1:2] 0.991 1.231 1.651 1.739 1.857 ...
#> $ PHI3 : num [1:40, 1:99, 1:2] -0.0634 -0.1443 0.0883 -0.4962 0.0432 ...
#> $ PHI4 : num [1:40, 1:99, 1:2] 0.1692 0.1272 -0.0977 0.0804 0.3663 ...
#> $ Eta1 : num [1:40, 1:1584, 1:2] 0.176 0.164 0.046 0.128 -0.479 ...
Followed by a Bayesian Spatiotemporal Zero Inflated Negative Binomial
(BSTZINB) model with linear temporal trend using the
BSTZINB
function with option LinearT
set to
TRUE
. We can check the convergence of different parameters
for this model using the conv.test
function supplied with
the package. DIC can also be computed using the supplied
compute_ZINB_DIC
function.
res3 <- BSTZINB(y, X, A, LinearT=TRUE, nchain=2, niter=50, nburn=10, nthin=1)
glimpse(res3)
#> List of 11
#> $ Alpha : num [1:40, 1:3, 1:2] -0.459 -0.286 -0.407 -0.53 -0.546 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:3] "" "x" "t"
#> .. ..$ : NULL
#> $ Beta : num [1:40, 1:3, 1:2] 0.861 1.027 0.909 0.753 0.92 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:3] "" "x" "t"
#> .. ..$ : NULL
#> $ R : num [1:40, 1:2] 1.001 1.011 0.964 0.957 0.947 ...
#> $ Sigphi: num [1:40, 1:16, 1:2] 0.22 0.237 0.315 0.333 0.234 ...
#> $ PHI1 : num [1:40, 1:99, 1:2] -0.174 -0.349 -0.328 -0.762 -0.28 ...
#> $ PHI2 : num [1:40, 1:99, 1:2] -0.4255 -0.3748 -0.4776 0.2998 -0.0594 ...
#> $ PHI3 : num [1:40, 1:99, 1:2] -0.0521 -0.0541 -0.068 -0.3866 0.0741 ...
#> $ PHI4 : num [1:40, 1:99, 1:2] 0.0713 0.6015 0.5084 0.3225 0.1051 ...
#> $ Eta1 : num [1:40, 1:1584, 1:2] -0.765 -0.823 -0.857 -1.261 -0.907 ...
#> $ Eta2 : num [1:40, 1:1584, 1:2] 0.999 1.034 1.348 1.111 0.628 ...
#> $ I : num [1:40, 1:1584, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
conv.test(res3$Alpha) ## Convergence test for Alpha parameters
#> [1] TRUE FALSE FALSE
conv.test(res3$Beta) ## Convergence test for Beta parameters
#> [1] FALSE FALSE FALSE
conv.test(res3$R) ## Convergence test for R parameters
#> [1] FALSE
## Plotting the chains for the parameters
oldpar <- par(no.readonly = TRUE)
par(mfrow=c(1,2));plot(res3$R[,1],main="R");plot(res3$R[,2],main="R")
par(mfrow=c(1,2));plot(res3$Alpha[,1,1],type='o',main="Alpha");plot(res3$Alpha[,1,2],type='o',main="Alpha")
par(oldpar)
## Checking the estimated beta with the true beta
apply(res3$Beta,2,mean)
#> x t
#> 0.9382654 -0.2383876 -0.1371745
true.beta
#> [1] 0.50 -0.25 0.10
## Computing DIC
compute_ZINB_DIC(y,res3,dim(res3$Beta)[1],2)
#> [1] 3793.11
And, another BSTZINB model but with spline fitted non-linear temporal
trend is fitted below by switching the LinearT
to
FALSE
. Corresponding convergence tests, plots and DIC
computation are also presented. We can compare the two DICs to pick the
best fit model.
res4 <- BSTZINB(y, X, A, LinearT=FALSE, nchain=2, niter=50, nburn=10)
glimpse(res4)
#> List of 11
#> $ Alpha : num [1:40, 1:9, 1:2] 0.0257 -0.4586 -0.4462 -0.841 -0.5108 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:9] "" "x" "t1" "t2" ...
#> .. ..$ : NULL
#> $ Beta : num [1:40, 1:9, 1:2] 0.853 0.995 0.787 0.677 0.94 ...
#> ..- attr(*, "dimnames")=List of 3
#> .. ..$ : NULL
#> .. ..$ : chr [1:9] "" "x" "t1" "t2" ...
#> .. ..$ : NULL
#> $ R : num [1:40, 1:2] 1.04 1.04 1.04 1.03 1.06 ...
#> $ Sigphi: num [1:40, 1:16, 1:2] 0.197 0.245 0.333 0.309 0.362 ...
#> $ PHI1 : num [1:40, 1:99, 1:2] -0.0493 -0.029 -0.2149 0.1445 -0.3777 ...
#> $ PHI2 : num [1:40, 1:99, 1:2] -0.3427 0.2001 0.1584 0.1416 0.0209 ...
#> $ PHI3 : num [1:40, 1:99, 1:2] 0.0812 -0.1302 -0.0414 -0.0613 0.1721 ...
#> $ PHI4 : num [1:40, 1:99, 1:2] 0.0294 0.1806 -0.2608 0.3836 -0.274 ...
#> $ Eta1 : num [1:40, 1:1584, 1:2] -0.267 -0.656 -0.662 -0.876 -1.108 ...
#> $ Eta2 : num [1:40, 1:1584, 1:2] 0.909 1.134 1.118 0.895 0.824 ...
#> $ I : num [1:40, 1:1584, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
## Convergence tests
conv.test(res4$Alpha) ## Alpha parameters
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
conv.test(res4$Beta) ## Beta parameters
#> [1] FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
conv.test(res4$R) ## R parameters
#> [1] FALSE
oldpar <- par(no.readonly = TRUE)
par(mfrow=c(1,2));plot(res4$R[,1],main="R");plot(res4$R[,2],main="R")
par(mfrow=c(1,2));plot(res4$Alpha[,1,1],type='o',main="Alpha");plot(res4$Alpha[,1,2],type='o',main="Alpha")
par(oldpar)
## Posterior means for beta and R
apply(res4$Beta,2,mean)
#> x t1 t2 t3 t4
#> 0.86358591 -0.23717822 0.11512622 -0.04913811 0.01639619 -0.01699795
#> t5 t6 t7
#> -0.37292695 -0.06920403 0.10778527
mean(res4$R)
#> [1] 1.029068
## DIC
compute_ZINB_DIC(y,res4,dim(res4$Beta)[1],2)
#> [1] 3984.363
These results may be summarized using the supplied
ResultTableSummary
function. Here we summarize the output
of the BSTZINB model with the linear time trend:
Coefficients | Estimates1 |
---|---|
a. | -0.504 (-0.790, -0.372) |
a.x | 0.189 (0.078, 0.285) |
a.t | 0.629 (0.364, 0.962) |
b. | 0.938 (0.802, 1.121) |
b.x | -0.238 (-0.335, -0.118) |
b.t | -0.137 (-0.402, 0.061) |
1 Point estimates (90% credible intervals) |
A more detailed summary can be obtained by using the function
ResultTableSummary2
. This functions takes the data
(y
, X
, A
, nt
) as
input, runs all of the above 4 models, computes DICs and checks
convergences, and finally summarizes them:
Coefficients | BNB1 | BZINB1 | BSTNB1 | BSTZINB1 |
---|---|---|---|---|
a.t | ||||
a. | -0.237 (-0.265, -0.191) | -0.121 (-0.185, -0.021) | ||
a.x | 0.299 (0.278, 0.323) | 0.267 (0.214, 0.303) | ||
a.t1 | -0.589 (-0.947, -0.242) | |||
a.t2 | 0.125 (-0.132, 0.452) | |||
a.t3 | -0.751 (-1.015, -0.539) | |||
a.t4 | 0.479 (0.223, 0.615) | |||
a.t5 | -0.453 (-0.710, -0.183) | |||
a.t6 | 1.124 (0.866, 1.376) | |||
a.t7 | 0.030 (-0.200, 0.231) | |||
b.t | ||||
b. | 0.323 (0.243, 0.362) | 0.848 (0.827, 0.886) | 0.039 (-0.031, 0.117) | 0.960 (0.835, 1.054) |
b.x | -0.086 (-0.092, -0.072) | -0.259 (-0.277, -0.233) | -0.085 (-0.115, -0.052) | -0.259 (-0.292, -0.226) |
b.t1 | 0.074 (-0.212, 0.249) | |||
b.t2 | -0.056 (-0.303, 0.252) | |||
b.t3 | -0.158 (-0.318, 0.002) | |||
b.t4 | 0.104 (-0.087, 0.194) | |||
b.t5 | -0.617 (-0.801, -0.389) | |||
b.t6 | -0.002 (-0.170, 0.147) | |||
b.t7 | 0.054 (-0.075, 0.133) | |||
1 Point estimates (90% credible intervals) DIC1 = 4444 ; DIC2 = 3995.1 ; DIC3 = 4293.9 ; DIC4 = 4059.6 |
Next, we’ll focus on the outputs of these models and visualizing them. First, we’ll visualize a time-averaged Eta parameter in the following way:
# Map Visualization : Time-averaged Eta
this_result_dat <- this_dat
this_result_dat$y <- apply(res4$Eta1,2,mean)
this_result_dat %>% head
#> sid tid y V1 x
#> 1 1 1 -0.5346649 1 -0.9924003
#> 2 1 2 -0.8728910 1 -0.9924003
#> 3 1 3 -0.8755784 1 -0.9924003
#> 4 1 4 -0.8098116 1 -0.9924003
#> 5 1 5 -0.8740858 1 -0.9924003
#> 6 1 6 -0.9925393 1 -0.9924003
USDmapCount(state.sel = c("IA"),
dat = this_result_dat,
scol = 1,
cname = countyname)
Next, we visualize random and fixed effects of Eta:
# Map Visualization : Spatio-temporal Random Effect of Eta
this_result_dat <- this_dat
Phi1 <- rep(apply(res4$PHI1,2,mean),nis)
Phi2 <- rep(apply(res4$PHI2,2,mean),nis)
tt <- this_dat$tid / max(this_dat$tid)
this_result_dat$y <- Phi1+Phi2*tt
USDmapCount(state.sel = c("IA"),
dat = this_result_dat,
scol = 1,
tcol = 2,
cname = countyname)
# Map Visualization : Temporal Fixed Effect of Eta
this_result_dat <- this_dat
Phi1 <- rep(apply(res4$PHI1,2,mean),nis)
Phi2 <- rep(apply(res4$PHI2,2,mean),nis)
tt <- this_dat$tid / max(this_dat$tid)
this_result_dat$y <- apply(res4$Eta1,2,mean) - Phi1+Phi2*tt
USDmapCount(state.sel = c("IA"),
dat = this_result_dat,
scol = 1,
tcol = 2,
cname = countyname)
We can also map the time-averaged and time-specified probability of at-risk of disease:
# Map Visualization : Time-averaged probability at risk of disease
this_result_dat <- this_dat
this_result_dat$y <- inv.logit(apply(res4$Eta1,2,mean))
this_result_dat %>% head
#> sid tid y V1 x
#> 1 1 1 0.3694295 1 -0.9924003
#> 2 1 2 0.2946531 1 -0.9924003
#> 3 1 3 0.2940949 1 -0.9924003
#> 4 1 4 0.3079306 1 -0.9924003
#> 5 1 5 0.2944048 1 -0.9924003
#> 6 1 6 0.2704108 1 -0.9924003
USDmapCount(state.sel = c("IA"),
dat = this_result_dat,
scol = 1,
tcol = 2,
cname = countyname)
# Map Visualization : Time-specified probability at risk of disease
this_result_dat <- this_dat
this_result_dat$y <- inv.logit(apply(res4$Eta1,2,mean))
this_result_dat %>% head
#> sid tid y V1 x
#> 1 1 1 0.3694295 1 -0.9924003
#> 2 1 2 0.2946531 1 -0.9924003
#> 3 1 3 0.2940949 1 -0.9924003
#> 4 1 4 0.3079306 1 -0.9924003
#> 5 1 5 0.2944048 1 -0.9924003
#> 6 1 6 0.2704108 1 -0.9924003
USDmapCount(state.sel = c("IA"),
dat = this_result_dat,
scol = 1,
tcol = 2,
tsel = 1,
cname = countyname)
Further summarization of the output can be done by plotting the
probabilities of the top few counties by quantiles and by numbers using
qRankPar
and qRankParTop
functions:
# Bar Plot of vn-quantile Probability at Risk of disease
qRankPar(state.set=c("IA"),cname=countyname,bstfit=res3,vn=12,
cex.title=18, cex.lab=12, cex.legend=12)
# Bar Plot of Top vn number of Probability at Risk of disease
qRankParTop(state.set=c("IA"),cname=countyname,bstfit=res3,vn=12,
cex.title=18, cex.lab=12, cex.legend=12)
The non-linear time effect for abundance for BSTZINB outputs
(non-smoothed and smoothed versions) can be plotted using the
TimetrendCurve
function: