######################################
### Example using the classical ###
### iris data set as a toy example ###
######################################
data(iris)
# Import the iris dataset
versicolor_data=iris[iris$Species=="versicolor",]
# Select only the specimens belonging to the species Iris versicolor
versicolor_sepal=versicolor_data[,grep("Sepal",
colnames(versicolor_data))]
versicolor_petal=versicolor_data[,grep("Petal",
colnames(versicolor_data))]
# Separate sepal and petal data for I. versicolor
PLS_sepal_petal_versicolor=pls(versicolor_sepal,
versicolor_petal,
perm=99)
summary(PLS_sepal_petal_versicolor)
# Compute the PLS for I. versicolor
plot(PLS_sepal_petal_versicolor$XScores[,1],
PLS_sepal_petal_versicolor$YScores[,1],
asp = 1,
xlab = "PLS 1 Block 1 scores",
ylab = "PLS 1 Block 2 scores")
# Plot the scores for the original data on the first pair of PLS
# axes (one axis per block)
# This is the data based on which we will compute the major axis
# direction
# Imagine fitting a line through those point, that is the major axis
Pred_major_axis_versicolor=pls_major_axis(PLS_sepal_petal_versicolor,
axes_to_use=1)
# Compute for I. versicolor the projections to the major axis
# using only the first pair of PLS axes (and scaling the scores
# prior to the computation)
hist(Pred_major_axis_versicolor$original_major_axis_projection[[1]]$original_data_PLS_projection,
main="Original data - projections on the major axis - based on the first pair of PLS axes",
xlab="Major axis score")
# Plot distribution of PLS scores for each individual in the
# original data (I. versicolor)
# projected on the major axis for the first pair of PLS axis
Pred_major_axis_versicolor$original_major_axis_predictions_reversed$Block1
Pred_major_axis_versicolor$original_major_axis_predictions_reversed$Block2
# Shape for each individual of the original data (I. versicolor)
# predicted by its position along the major axis
# Now we will use the data from new species (I. setosa and I
# virginica) and obtain predictions from the PLS model obtained for
# I. versicolor
# The easiest is to use the data for all three species
# as if they were both new data
# (using versicolor as new data is not going to affect the model)
all_sepal=iris[,grep("Sepal", colnames(iris))]
all_petal=iris[,grep("Petal", colnames(iris))]
# Separate sepals and petals (they are the two blocks)
Pred_major_axis_versicolor_newdata=pls_major_axis(
pls_object=PLS_sepal_petal_versicolor,
new_data_x = all_sepal,
new_data_y = all_petal,
axes_to_use=1)
# Perform the major axis computation using new data
# Notice that:
# - we are using the old PLS model (computed on versicolor only)
# - we are adding the new data in the same order as in the original
# model (i.e., new_data_x is sepal data, new_data_y is petal data)
plot(Pred_major_axis_versicolor_newdata$new_data_results$new_data_Xscores[,1],
Pred_major_axis_versicolor_newdata$new_data_results$new_data_Yscores[,1],
col=iris$Species, asp=1,
xlab = "Old PLS, Axis 1, Block 1",
ylab = "Old PLS, Axis 1, Block 2")
# Plot the new data (both versicolor and setosa)
# in the space of the first pair of PLS axes computed only on
# versicolor
# The three species follow a quite similar trajectories
# but they have different average value on the major axis
# To visualize this better, we can plot the scores along the major
# axis for the three species
boxplot(Pred_major_axis_versicolor_newdata$new_data_results$new_data_major_axis_proj[,1]~
iris$Species,
xlab="Species",
ylab="Score on the major axis")
# We can also visualize the deviations from the major axis
# For instance by putting the predictions of the two blocks together
# Computing differences and then performing a PCA
predictions_all_data=cbind(
Pred_major_axis_versicolor_newdata$new_data_results$new_data_Block1_proj_prediction_revert,
Pred_major_axis_versicolor_newdata$new_data_results$new_data_Block2_proj_prediction_revert)
# Get the predictions for the two blocks (sepals and petals)
# and put them back together
Euc_dist_from_predictions=unlist(lapply(seq(nrow(iris)),
function(i)
dist(rbind(iris[i,1:4],predictions_all_data[i,]))))
# for each flower, compute the Euclidean distance between
# the original values and what is predicted by the model
boxplot(Euc_dist_from_predictions~iris$Species,
xlab="Species",
ylab="Euclidean distance from prediction")
# I. setosa is the one which deviates the most from the prediction
Run the code above in your browser using DataLab