# We provide an example of one possible way to parse the Stringdb
# Homo Sapien protein actions network and prepare
# it to be used with our package. First, we need to
# upload the network which is attached to QuaternaryProd
# for convenience.
library(readr)
library(org.Hs.eg.db)
library(dplyr)
library(stringr)
library(fdrtool)
# Get the full file name containing the STRINGdb relations
ff <- system.file("extdata", "9606.protein.actions.v10.txt.gz"
, package="QuaternaryProd")
all_rels <- read_tsv(gzfile(ff), col_names = TRUE)
# Next, we filter out the important columns and important
# relations. We remove all rows which do not have a relation
# activation, inhibition and expression. Moreover, we
# also consider reverse causality for any relation which has
# a direction value equal to 0.
# Set new names for columns
names(all_rels) <- c("srcuid", "trguid", "mode", "action", "direction","score")
Rels <- all_rels[, c("srcuid", "trguid", "mode", "direction")]
# Get all rows with causal relations
Rels <- Rels[Rels$mode %in% c("activation", "inhibition","expression"),]
# Get causal relations where direction is not specified,
# and consider reversed direction of causality as a valid
# causal relation
Bidirectional <- Rels[Rels$direction == 0 ,
c("trguid", "srcuid", "mode", "direction")]
names(Bidirectional) <- c("srcuid", "trguid", "mode", "direction")
Rels <- unique(bind_rows(Rels, Bidirectional))
Rels$direction <- NULL
# Rename activation as increases, inhibition as decreases,
# expression as regulates
Rels$mode <- sub("activation", "increases", Rels$mode)
Rels$mode <- sub("inhibition", "decreases", Rels$mode)
Rels$mode <- sub("expression", "regulates", Rels$mode)
Rels <- unique(Rels)
# Get a subset of the network: Skip this step if you want the p-values
# of the scores corresponding to the source nodes computed over the
# entire network.
Rels <- Rels[sample(1:nrow(Rels), 40000, replace=FALSE),]
# Third, we extract the protein entities from the network, and
# we map them to their respective genes. Note, the entities could
# have been possibly a drug or compound, but we are working with
# this protein interactions network for the purpose of providing
# a nontrivial example.
# Get all unique protein ensemble ids in the causal network
allEns <- unique(c(Rels$srcuid, Rels$trguid))
# Map ensemble protein ids to entrez gene ids
map <- org.Hs.egENSEMBLPROT2EG
id <- unlist(mget(sub("9606.","",allEns), map, ifnotfound=NA))
id[is.na(id)] <- "-1"
uid <- paste("9606.", names(id), sep="")
# Function to map entrez ids to gene symbols
map <- org.Hs.egSYMBOL
symbol <- unlist(mget(id, map, ifnotfound=NA))
symbol[is.na(symbol)] <- "-1"
# Create data frame of STRINGdb protein Id, entrez id and gene
# symbol and type of entity
Ents <- data_frame(uid, id, symbol, type="protein")
Ents <- Ents[Ents$uid %in% allEns,]
# Remove ensemble ids in entities with duplicated entrez id
Ents <- Ents[!duplicated(Ents$id),]
# Add mRNAs to entities
uid <- paste("mRNA_", Ents$uid, sep = "")
mRNAs <- data_frame(uid=uid, id=Ents$id, symbol=Ents$symbol, type="mRNA")
Ents <- bind_rows(Ents, mRNAs)
# Get all unique relations
Rels$trguid <- paste("mRNA_", Rels$trguid, sep="")
Rels <- Rels[Rels$srcuid %in% Ents$uid & Rels$trguid %in% Ents$uid,]
Rels <- unique(Rels)
# Leave source proteins which contain at least 10 edges
sufficientRels <- group_by(Rels, srcuid) %>% summarise(count=n())
sufficientRels <- sufficientRels %>% filter(count > 10)
Rels <- Rels %>% filter(srcuid %in% sufficientRels$srcuid)
# Given new gene expression data, we can compute the scores and p-values
# for all source nodes in the network. BioQCREtoNet is a specialized
# function for this purpose.
# Gene expression data
evidence1 <- system.file("extdata", "e2f3_sig.txt", package = "QuaternaryProd")
evidence1 <- read.table(evidence1, sep = "\t", header = TRUE
, stringsAsFactors = FALSE)
# Remove duplicated entrez ids in evidence and rename column names appropriately
evidence1 <- evidence1[!duplicated(evidence1$entrez),]
names(evidence1) <- c("entrez", "pvalue", "fc")
# Run Quaternary CRE for entire Knowledge base on new evidence
# which computes the statistic for each of the source proteins
CRE_results <- BioQCREtoNet(Rels, evidence1, Ents, is.Logfc = TRUE)
Run the code above in your browser using DataLab