# NOT RUN {
# create an example matrix and its transpose
min.dim <- 200; nvar <- 500; subset.size <- 50
mat <- matrix(rnorm(min.dim*nvar),ncol=min.dim)
prv.large(mat)
t.mat <- t(mat)
# create two alternative covariance matrices
MMs <- t.mat %*% mat
MsM <- mat %*% t.mat
# run singular value decomposition
pca <- svd(mat)
D <- pca$d # singular values (=sqrt(eigenvalues))
V <- pca$v # right singular vector
U <- pca$u # left singular vector
sig <- mat-mat; diag(sig) <- D;
MMs2 <- V %*% (t(sig) %*% sig) %*% t(V)
sig <- t.mat-t.mat; diag(sig) <- D;
MsM2 <- U %*% (sig %*% t(sig)) %*% t(U)
# show that the covariance matrices are equal to the functions of
# the left and right singular vectors
prv(MMs,MsM); prv(MMs2,MsM2)
pr <- princomp(mat) # PCA using eigendecomposition of cov matrix
L <- matrix(rep(0,40000),ncol=200); diag(L) <- pr[[1]]^2 # eigenvalues as diag
mat2 <- (pr[[2]]) %*% L %*% solve(pr[[2]]) # = eigenvectors * eigenvalues * inv(eigenvectors)
prv.large(cov(mat)); prv.large(mat2) # == COVmat (may be slight tolerance differences)
## Now demonstrate the correlation between SVD and PCA ##
# the right singular vector is highly correlated with the pca loadings:
median(abs(diag(cor(V,pr[["loadings"]]))))
# the left singular vector is highly correlated with the pca scores (eigenvectors):
median(abs(diag(cor(U,pr[["scores"]]))))
cor(pr$sdev,D) # the singular values are equivalent to the eigenvalues
## MAIN EXAMPLES ##
orig.dir <- getwd(); setwd(tempdir()); # move to temporary dir
if(file.exists("testMyBig.bck")) { unlink(c("testMyBig.bck","testMyBig.dsc")) }
bmat <- as.big.matrix(mat,backingfile="testMyBig.bck",
descriptorfile="testMyBig.dsc", backingpath = getwd())
result <- big.PCA(bmat) #,verbose=TRUE)
headl(result)
# plot the eigenvalues with a linear fit line and elbow placed at 13
Eigv <- pca.scree.plot(result$Evalues,M=bmat,elbow=6,printvar=FALSE)
rm(bmat)
unlink(c("testMyBig.bck","testMyBig.dsc"))
## generate some data with reasonable intercorrelations ##
mat2 <- sim.cor(500,200,genr=function(n){ (runif(n)/2+.5) })
bmat2 <- as.big.matrix(mat2,backingfile="testMyBig2.bck",
descriptorfile="testMyBig2.dsc", backingpath = getwd())
# calculate PCA on decreasing subset size
result2 <- big.PCA(bmat2,thin=FALSE)
normal <- result2$PCs; rm(result2)
result3 <- big.PCA(bmat2,thin=TRUE,keep=.5, pref="t1")
thinned <- result3$PCs; rm(result3)
result4 <- big.PCA(bmat2,thin=TRUE,keep=.5, pref="t2", how="cor")
corred <- result4$PCs; rm(result4)
result5 <- big.PCA(bmat2,thin=TRUE,keep=.5, pref="t3", how="pca")
pced <- result5$PCs; rm(result5)
result6 <- big.PCA(bmat2,thin=TRUE,keep=.2, pref="t4")
thinner <- result6$PCs; rm(result6)
## correlate the resulting PCs with the un-thinned PCs
cors.thin.with.orig <- apply(cor(normal,thinned),1,max)
cors.corred.with.orig <- apply(cor(normal,corred),1,max)
cors.pced.with.orig <- apply(cor(normal,pced),1,max)
cors.thinner.with.orig <-apply(cor(normal,thinner),1,max)
plot(cors.thin.with.orig,type="l",col="red",ylim=c(0,1))
lines(cors.thinner.with.orig,col="orange")
lines(cors.corred.with.orig,col="lightblue")
lines(cors.pced.with.orig,col="lightgreen")
# can see that the first component is highly preserved,
# and next components, somewhat preserved; try using different thinning methods
rm(bmat2)
unlink(c("testMyBig2.bck","testMyBig2.dsc"))
setwd(orig.dir)
# }
Run the code above in your browser using DataLab