### Example 1: Manhattan distance (norm = "M") on a complete database
# Only a subset of tab_test is kept for this example:
# rows 1 to 80 and rows 5001 to 5080 (160 rows in total)
data(tab_test)
tab_test2 <- tab_test[c(seq_len(80), seq(5001, 5080)), ]
dim(tab_test2)

# Insert missing values in Y1 and Y2 (columns 2 and 3):
# column 2 is set to NA when ident == 2, column 3 when ident == 1
tab_test2[tab_test2$ident == 2, 2] <- NA
tab_test2[tab_test2$ident == 1, 3] <- NA

# All covariates are already ordered and stored in numeric form,
# so no preliminary call to transfo_dist is needed here
mat_testm <- proxim_dist(tab_test2, norm = "M")

### Y (Y1) and Z (Y2) are the same variable encoded in 2 different forms:
### 4 levels for Y1 and 3 levels for Y2,
### stored in two distinct databases, A and B, respectively.
### The marginal distribution of Y in B is unknown,
### and so is the marginal distribution of Z in A.

# Suppose that the matrix mat_trans below represents an estimate of the
# joint distribution L(Y, Z).
# Note that in reality this distribution is UNKNOWN and is estimated
# in the OT function by solving the optimization problem.
# Values are listed column by column (4 rows for Y1, 3 columns for Y2):
val_trans <- c(
  0.275, 0.115, 0, 0,
  0, 0.085, 0.165, 0,
  0, 0, 0.095, 0.265
)
mat_trans <- matrix(val_trans, nrow = 4, ncol = 3)

# Individual predictions of Z in A (only), obtained by computing
# average distances on 90% of the nearest neighbors of
# each modality of Z in B
predopt_A <- indiv_grp_optimal(
  mat_testm,
  jointprobaA = mat_trans,
  jointprobaB = mat_trans,
  percent_closest = 0.90,
  which.DB = "A"
)
# \donttest{
### Example 2: Manhattan distance with incomplete covariates
data(simu_data)

# Prepare the covariates for the Manhattan distance (prep_choice = "M"),
# declaring the type of each column by its index
man1 <- transfo_dist(
  simu_data,
  quanti = c(3, 8),
  nominal = c(1, 4:5, 7),
  ordinal = c(2, 6),
  logic = NULL,
  prep_choice = "M"
)
mat_man1 <- proxim_dist(man1, norm = "M")

### Y and Z are the same variable encoded in 2 different forms
### (3 levels for Y and 5 levels for Z),
### stored in two distinct databases, A and B, respectively.
### The marginal distribution of Y in B is unknown,
### and so is the marginal distribution of Z in A.

# Suppose that the matrix mat_trans2 below represents an estimate of the
# joint distribution L(Y, Z).
# Note that in reality this distribution is UNKNOWN and is estimated
# in the OT function by solving an optimization problem.
# Values are listed column by column (3 rows for Y, 5 columns for Z):
mat_trans2 <- matrix(
  c(
    0.3625, 0, 0,
    0.07083333, 0.05666667, 0,
    0, 0.0875, 0,
    0, 0.1075, 0,
    0, 0.17166667, 0.1433333
  ),
  nrow = 3,
  ncol = 5
)

# The predicted values of Y in database B and of Z in
# database A are stored in the following object:
predopt2 <- indiv_grp_optimal(
  mat_man1,
  jointprobaA = mat_trans2,
  jointprobaB = mat_trans2,
  percent_closest = 0.90
)
summary(predopt2)
# }
# Run the code above in your browser using DataLab