library(breastCancerVDX)
library(Biobase)
data(vdx)
gene.expr=exprs(vdx) # Gene expression of the package
vdx.annot=fData(vdx) # Annotation associated to the dataset
vdx.clinc=pData(vdx) # Clinical information associated to the dataset
# Identifying the sample identifiers associated to ER+ and ER- breast cancer
er.pos=which(vdx.clinc$er==1)
er.neg=which(vdx.clinc$er==0)
# Only keep columns 1 and 3, probeset identifiers and Gene symbols respectively
vdx.annot=vdx.annot[,c(1,3)]
all(rownames(gene.expr)==as.character(vdx.annot[,1])) # Checking if the probeset are ordered with respect to the dataset
all(colnames(gene.expr)==as.character(vdx.clinc[,1])) # Checking if the sample identifiers are order with respect to the dataset
rownames(gene.expr)=as.character(vdx.annot[,2]) # Changing the row identifiers to the gene identifiers of interest
#===== Because we have several measurements for a gene (multiple rows for a gene), we filter the genes
#===== Function to obtain the genes with highest variabilty among phenotypes
gene.nms.u=unique(rownames(gene.expr))
gene.nms=rownames(gene.expr)
indices=NULL
for(i in 1:length(gene.nms.u))
{
aux=which(gene.nms==gene.nms.u[i])
if(length(aux)>1){
var.r = apply(cbind(apply(gene.expr[aux,er.pos],1,mean),apply(gene.expr[aux,er.neg],1,mean)),1,var)
aux=aux[which.max(var.r)]
}
indices=c(indices,aux)
}
#===== Only keep the genes with most variability among the phenotypes of interest
gene.expr=gene.expr[indices,]
gene.nams=rownames(gene.expr) # The gene symbols of interest are stored here
dim(gene.expr)
#===== In the following R dataset it is stored the .gmt file associated to the MF from GO.
#===== So "reading the GMT" is the only step that we skip. But an example is provided on the
#===== help file associated to the function "ReadGMT".
data(AnnotationMFGO,package="BAGS")
data.gene.grps=DataGeneSets(AnnotationMFGO,gene.nams,10)
phntp.list=list(er.pos,er.neg)
data.mcmc=MCMCDataSet(gene.expr,data.gene.grps$DataGeneSetsIds,phntp.list)
Run the code above in your browser using DataLab