data(BLOSUM45)
data(BLOSUM50)
data(BLOSUM62)
data(BLOSUM80)
data(BLOSUM100)
data(PAM30)
data(PAM40)
data(PAM70)
data(PAM120)
data(PAM250)
nucleotideSubstitutionMatrix(match = 1, mismatch = 0, baseOnly = FALSE, type = "DNA")
qualitySubstitutionMatrices(fuzzyMatch = c(0, 1), alphabetLength = 4L, qualityClass = "PhredQuality", bitScale = 1)
errorSubstitutionMatrices(errorProbability, fuzzyMatch = c(0, 1), alphabetLength = 4L, bitScale = 1)
TRUE
or FALSE
. If TRUE
, only
uses the letters in the "base" alphabet i.e. "A", "C", "G", "T"."PhredQuality"
,
"SolexaQuality"
, or "IlluminaQuality"
.nucleotideSubstitutionMatrix
produces a substitution matrix for all IUPAC
nucleic acid codes based upon match and mismatch parameters. errorSubstitutionMatrices
produces a two element list of numeric
square symmetric matrices, one for matches and one for mismatches. qualitySubstitutionMatrices
produces the substitution matrices
for Phred or Solexa quality-based reads.The BLOSUM45, BLOSUM62, BLOSUM80, PAM30 and PAM70 matrices were taken from NCBI stand-alone BLAST software.
The BLOSUM50, BLOSUM100, PAM40, PAM120 and PAM250 matrices were taken from ftp://ftp.ncbi.nih.gov/blast/matrices/
The quality matrices computed in qualitySubstitutionMatrices
are
based on the paper by Ketil Malde. Let $\epsilon_i$ be the probability
of an error in the base read. For "Phred"
quality measures $Q$
in $[0, 99]$, these error probabilities are given by
$\epsilon_i = 10^{-Q/10}$. For "Solexa"
quality measures
$Q$ in $[-5, 99]$, they are given by
$\epsilon_i = 1 - 1/(1 + 10^{-Q/10})$.
Assuming independence within and between base reads, the combined error
probability of a mismatch when the underlying bases do match is
$\epsilon_c = \epsilon_1 + \epsilon_2 - (n/(n-1)) * \epsilon_1 * \epsilon_2$,
where $n$ is the number of letters in the underlying alphabet. Using
$\epsilon_c$, the substitution score is given by when two bases match is given by
$b * \log_2(\gamma_{x,y} * (1 - \epsilon_c) * n + (1 - \gamma_{x,y}) * \epsilon_c * (n/(n-1)))$,
where $b$ is the bit-scaling for the scoring and $\gamma_{x,y}$ is the
probability that characters $x$ and $y$ represents the same underlying
information (e.g. using IUPAC, $\gamma_{A,A} = 1$ and $\gamma_{A,N} = 1/4$.
In the arguments listed above fuzzyMatch
represents $\gamma_{x,y}$
and errorProbability
represents $\epsilon_i$.
pairwiseAlignment
,
PairwiseAlignments-class,
DNAString-class,
AAString-class,
PhredQuality-class,
SolexaQuality-class,
IlluminaQuality-class
s1 <-
DNAString("ACTTCACCAGCTCCCTGGCGGTAAGTTGATCAAAGGAAACGCAAAGTTTTCAAG")
s2 <-
DNAString("GTTTCACTACTTCCTTTCGGGTAAGTAAATATATAAATATATAAAAATATAATTTTCATC")
## Fit a global pairwise alignment using edit distance scoring
pairwiseAlignment(s1, s2,
substitutionMatrix = nucleotideSubstitutionMatrix(0, -1, TRUE),
gapOpening = 0, gapExtension = 1)
## Examine quality-based match and mismatch bit scores for DNA/RNA
## strings in pairwiseAlignment.
## By default patternQuality and subjectQuality are PhredQuality(22L).
qualityMatrices <- qualitySubstitutionMatrices()
qualityMatrices["22", "22", "1"]
qualityMatrices["22", "22", "0"]
pairwiseAlignment(s1, s2)
## Get the substitution scores when the error probability is 0.1
subscores <- errorSubstitutionMatrices(errorProbability = 0.1)
submat <- matrix(subscores[,,"0"], 4, 4)
diag(submat) <- subscores[,,"1"]
dimnames(submat) <- list(DNA_ALPHABET[1:4], DNA_ALPHABET[1:4])
submat
pairwiseAlignment(s1, s2, substitutionMatrix = submat)
## Align two amino acid sequences with the BLOSUM62 matrix
aa1 <- AAString("HXBLVYMGCHFDCXVBEHIKQZ")
aa2 <- AAString("QRNYMYCFQCISGNEYKQN")
pairwiseAlignment(aa1, aa2, substitutionMatrix = "BLOSUM62", gapOpening = 3, gapExtension = 1)
## See how the gap penalty influences the alignment
pairwiseAlignment(aa1, aa2, substitutionMatrix = "BLOSUM62", gapOpening = 6, gapExtension = 2)
## See how the substitution matrix influences the alignment
pairwiseAlignment(aa1, aa2, substitutionMatrix = "BLOSUM50", gapOpening = 3, gapExtension = 1)
if (interactive()) {
## Compare our BLOSUM62 with BLOSUM62 from ftp://ftp.ncbi.nih.gov/blast/matrices/
data(BLOSUM62)
BLOSUM62["Q", "Z"]
file <- "ftp://ftp.ncbi.nih.gov/blast/matrices/BLOSUM62"
b62 <- as.matrix(read.table(file, check.names=FALSE))
b62["Q", "Z"]
}
Run the code above in your browser using DataLab