# This example illustrates the usefulness of the preprocess function.
# Four random binary adjacency matrices with partly overlapping node sets
# are simulated; each cell is an independent Bernoulli draw with p = 0.1.

# Fix the seed of the random number generator so that the simulated
# networks (and everything computed from them below) are the same on
# every run. Any fixed value works; change it to obtain other networks.
set.seed(12345)

# first network: nodes a to j present (10 nodes)
mat1 <- matrix(
  rbinom(100, 1, 0.1),
  nrow = 10,
  dimnames = list(letters[1:10], letters[1:10])
)

# second network: nodes c to n present (12 nodes)
mat2 <- matrix(
  rbinom(144, 1, 0.1),
  nrow = 12,
  dimnames = list(letters[3:14], letters[3:14])
)

# third network: nodes a and d to k present (9 nodes)
mat3 <- matrix(
  rbinom(81, 1, 0.1),
  nrow = 9,
  dimnames = list(letters[c(1, 4:11)], letters[c(1, 4:11)])
)

# fourth network: same as second matrix
mat4 <- mat2

# list of networks, ordered by time step (t = 1, ..., 4)
networks <- list(mat1, mat2, mat3, mat4)
# btergm() is not part of base R: attach the btergm package before the
# first model is estimated, otherwise the call below fails because the
# function cannot be found.
library(btergm)

# btergm without cross-temporal dependencies:
model.1 <- btergm(networks ~ edges + mutual)
summary(model.1)
# When cross-temporal dependencies are specified, the dimensions
# of the matrices do not match. This would cause a problem for btergm.
# The call is wrapped in `if (FALSE)` so that it is shown for
# illustration but never executed when the script is run:
if (FALSE) {
  btergm(networks[2:4] ~ edges + mutual + edgecov(networks[1:3])) # ERROR!
}
# btergm expects the dependent network and the lagged covariate to have
# the same dimensions at every time step: the first dependent network
# must match the first lagged network, the second the second, and so on.
# Nodes that are absent from the covariate ({k, l, m, n} at t=1, {a} at
# t=2, and {c, l, m, n} at t=3) therefore have to be dropped from the
# dependent network at t=2, t=3 and t=4, respectively:
dep <- preprocess(
  networks,
  lag = TRUE,
  covariate = FALSE
)

# As a result, dep shrinks from 12 to 8 nodes at t=2, from 9 to 8 at
# t=3, and from 12 to 8 at t=4, and the first network is no longer part
# of the list. The reverse problem exists as well: some nodes are present
# in the lagged covariate but missing from the dependent network at the
# next time step. Node sets {a, b}, {c, l, m, n}, and {a} are therefore
# removed from the lagged covariate at t=1, t=2, and t=3, respectively,
# so that the dimensions become compatible:
lag <- preprocess(
  networks,
  lag = TRUE,
  covariate = TRUE
)
# To compare the dimensions of the original versus preprocessed
# dependent networks and covariates, tabulate the number of rows per
# time step. vapply() with an integer(1) template returns plain integer
# vectors, so cbind() produces an ordinary integer matrix (one row per
# time step, one column per list) instead of a matrix of list cells.
cbind(
  original_dep = vapply(networks[2:4], nrow, integer(1)),
  original_lag = vapply(networks[1:3], nrow, integer(1)),
  new_dep = vapply(dep, nrow, integer(1)),
  new_lag = vapply(lag, nrow, integer(1))
)
# After preprocessing, the dependent networks have 8, 8 and 8 nodes
# (down from 12, 9 and 12) and the lagged networks have 8, 8 and 8 nodes
# (down from 10, 12 and 9). Within each time step, the dependent and the
# lagged network now share the same node set. Print the row labels side
# by side to see this:
cbind(rownames(dep[[1L]]), rownames(lag[[1L]]))
cbind(rownames(dep[[2L]]), rownames(lag[[2L]]))
cbind(rownames(dep[[3L]]), rownames(lag[[3L]]))

# Across time steps, however, the composition of each list still
# changes. One column per time step, first for the dependent networks,
# then for the lagged networks:
do.call(cbind, unname(lapply(dep[1:3], rownames)))
do.call(cbind, unname(lapply(lag[1:3], rownames)))

# The preprocessed lists can now be passed to btergm:
model.2 <- btergm(dep ~ edges + mutual + edgecov(lag))
summary(model.2)

# Estimation is possible at this point because the current and the
# lagged network have identical node sets at each time step. The price
# of this approach is that some observations are lost; the benefit is
# that cross-temporal theories can be tested.
# However, since the node sets still differ across time steps, ROC and
# PR curves cannot be estimated. This is true because a simulation from
# nodes {c ... j} cannot be compared to a target network with nodes
# {d ... k}. Therefore, the following command would compare the wrong
# sets of nodes to estimate prediction performance. It is guarded by
# `if (FALSE)` so that it is displayed but never executed:
if (FALSE) {
  gof.2 <- gof(model.2, classicgof = FALSE, rocprgof = TRUE) # PROBLEM!
}
# The most obvious way around this problem is to estimate the model on
# the earlier time steps only and to assess out-of-sample predictive
# performance for the last network alone:
model.3 <- btergm(
  dep[1:2] ~ edges + mutual + edgecov(lag[1:2])
)
gof.3 <- gof(
  model.3,
  target = dep[[3]],
  formula = dep[[3]] ~ edges + mutual + edgecov(lag[[3]]),
  classicgof = FALSE,
  rocprgof = TRUE
)
# Here, time steps 2 and 3 are modeled as a function of the lagged
# network at time steps 1 and 2. The resulting coefficients are used to
# predict the network at time step 4, and network 4 is compared to
# simulations based on these coefficients and on the lagged network at
# the third time step. Because the matrices within the third list item
# have identical node sets, predictive performance can be computed.
# The resulting ROC and PR curves can be plotted as follows:
# First call: PR output switched off (pr = FALSE), so only the ROC part
# is requested, together with the random-graph reference
# (roc.random = TRUE) and the axis labels and title for the joint plot.
plot(
  gof.3,
  boxplot = FALSE,
  pr = FALSE,
  roc.random = TRUE,
  ylab = "TPR/PPV",
  xlab = "FPR/TPR",
  roc.main = "ROC and PR"
)

# Second call: ROC output switched off (roc = FALSE); rocpr.add = TRUE
# presumably adds the PR curves to the existing plot -- confirm against
# the documentation of the plot method.
plot(
  gof.3,
  boxplot = FALSE,
  roc = FALSE,
  pr.random = TRUE,
  rocpr.add = TRUE
)

# Legend for the four curves. The trailing "44" in the 8-digit hex codes
# is the alpha channel, i.e. a transparent variant of the same colour.
# NOTE(review): the colours are assumed to match those used by the plot
# method -- confirm.
legend(
  "right",
  legend = c("ROC", "ROC random graph", "PR", "PR random graph"),
  col = c("#bd0017", "#bd001744", "#5886be", "#5886be44"),
  lty = 1,
  lwd = 3
)

# For another example with real-world data, see vignette("knecht")
# Run the code above in your browser using DataLab