# Example: find positioned trigrams that differ between the two target groups.

# Load the example data set shipped with the package.
data(human_cleave)

# The first nine columns hold nine consecutive amino acids from cleavage
# sites. Degenerate them (five groups instead of 20 amino acids) to reduce
# the dimensionality of the problem.
deg_seqs <- degenerate(
  human_cleave[, seq_len(9L)],
  list(
    "1" = c(1, 6, 8, 10, 11, 18),
    "2" = c(2, 13, 14, 16, 17),
    "3" = c(5, 19, 20),
    "4" = c(7, 9, 12, 15),
    "5" = c(3, 4)
  )
)

# Extract positioned trigrams.
# NOTE(review): five groups are defined above, but the alphabet passed here
# is 1L:4 -- confirm that leaving out group 5 is intended.
trigrams <- count_ngrams(deg_seqs, 3, 1L:4, pos = TRUE)

# Select the features that differ between the two target groups.
test1 <- test_features(human_cleave[, "tar"], trigrams)

# Print a summary of the test results.
summary(test1)

# Aggregate the features into groups based on their p-value.
gr <- cut(test1)

# Look closer at the most significant n-grams (first group):
# position map of the n-grams ...
position_ngrams(gr[[1]])
# ... and the same n-grams in a more readable form.
decode_ngrams(gr[[1]])
# Run the code above in your browser using DataLab