Learn R Programming

ngstk

Introduction

The R package ngstk can be used to facilitate the analysis of NGS data, such as visualization, conversion of data formats for web service input, and other purposes.

During NGS data analysis, we often create many duplicated small scripts and color themes. In most cases, we cannot reuse them later if we do not remember when and where a script was created. ngstk is a framework that can be used to collect small scripts, color themes, and other material that should be packaged.

The purpose of ngstk is to help us manage those small scripts systematically and to store useful material for NGS data analysis. In particular, data visualization and the conversion of data formats and various database IDs were the main goals of the recent development cycle.

A simple guide can be found here.

Installation

CRAN

#You can install this package directly from CRAN by running (from within R):
install.packages('ngstk')

Github

# Install the cutting edge development version from GitHub:
# install.packages("devtools")
devtools::install_github("JhuangLab/ngstk")

Zip/Tarball

  1. Download the appropriate zip file or tar.gz file from Github
  2. Unzip the file and change directories into the ngstk directory
  3. Run `R CMD INSTALL pkg`

Usage

Data format conversion

demo_file <- system.file("extdata", "demo/proteinpaint/muts2pp_iseq.txt", package = "ngstk")
input_data <- read.table(demo_file, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
disease <- "T-ALL"
input_data <- data.frame(input_data, disease)
input_data$disease <- as.character(input_data$disease)

# Convert mutations data to proteinpaint input
result <- muts2pp(input_data, input_type = "iseq")
# Convert mutations data to cbioportal input
result <- muts2mutation_mapper(input_data, input_type = "iseq")
result <- muts2oncoprinter(input_data, input_type = "iseq")

demo_file <- system.file('extdata', 'demo/proteinpaint/fusions2pp_fusioncatcher.txt', package = 'ngstk')
input_data <- read.table(demo_file, sep = '\t', header = TRUE, stringsAsFactors = FALSE)
disease <- 'B-ALL'
sampletype <- 'diagnose'
input_data <- data.frame(input_data, disease, sampletype)
input_data$disease <- as.character(input_data$disease)
# Convert fusions data to proteinpaint input
hander_data <- fusions2pp(input_data, input_type = 'fusioncatcher')
# Convert fusions data to proteinpaint input (Meta rows)
hander_data <- fusions2pp_meta(input_data, input_type = 'fusioncatcher')

Data filtration

demo_file <- system.file("extdata", "demo/proteinpaint/fusions2pp_fusioncatcher.txt", package = "ngstk")
input_data <- read.table(demo_file, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
# Get a data subset according to the defined rule
mhander_extra_params = list(gene_5 = 1, gene_3 = 2, any_gene = "TCF3", fusions_any_match_flag = TRUE)
result_1 <- fusions_filter(input_data, mhander_extra_params = mhander_extra_params)

mhander_extra_params = list(gene_3 = 2, right_gene = "GYPA", fusions_right_match_flag = TRUE)
result_2 <- fusions_filter(input_data, mhander_extra_params = mhander_extra_params)

mhander_extra_params = list(gene_5 = 1, left_gene = "GYPA", fusions_left_match_flag = TRUE)
result_3 <- fusions_filter(input_data, mhander_extra_params = mhander_extra_params)

mhander_extra_params = list(gene_5 = 1, gene_3 = 2, left_gene = "GYPE", right_gene = "GYPA", fusions_full_match_flag = TRUE)
result_4 <- fusions_filter(input_data, mhander_extra_params = mhander_extra_params)

mhander_extra_params = list(gene_5 = 1, gene_3 = 2, left_gene = "GYPE", right_gene = "GYPA", fusions_anyfull_match_flag = TRUE)
result_5 <- fusions_filter(input_data, mhander_extra_params = mhander_extra_params)

Times

file_a <- tempfile()
file_b <- tempfile()
file.create(c(file_a, file_b))
x1 <- get_files_mtime(input_files = c(file_a, file_b))
x2 <- get_files_mtime(input_files = c(file_a, file_b), return_check = FALSE)
x3 <- get_files_mtime(input_files = c(file_a, file_b), return_mtime = FALSE)
x4 <- get_files_ctime(input_files = c(file_a, file_b))
x5 <- get_files_ctime(input_files = c(file_a, file_b), return_check = FALSE)

# time stamp
time_stamp()

Data split

x1 <- data.frame(col1 = 1:39, col2 = 1:39)
x <- split_row_data(x1, sections = 2)
x <- split_row_data(x1, sections = 3)
x1 <- data.frame(col1 = 1:10, col2 = 11:20)
x1.t <- t(x1)
x <- split_col_data(x1.t, sections = 3)
# split file
dat <- data.frame(col1 = 1:10000)
outfn <- tempfile()
write.table(dat, outfn, sep = "\t")
split_row_file(outfn)
split_row_file(outfn, use_system_split = TRUE)

Filename Process

files_dir <- system.file('extdata', 'demo/format', package = 'ngstk')
pattern <- '*.txt'
list.files(files_dir, pattern)
x <- format_filenames(files_dir = files_dir, pattern = pattern, profix = 'hg38_')

Command line utils functions

# Collect command line bins files in R package
rbin('ngstk', tempdir())

# Print sub commands
option_list <- list(
  make_option(c('-l', '--list-all-subcmds'), action = 'store_true',
               default = FALSE, help = 'Print all supported subcmds of ngsjs.')
 )
subcmds_list <- list(subcmd1 = 'Use method 1 to plot boxplot',
                      subcmd2 = 'Use method 2 to plot boxplot')
 description <- 'Method to plot boxplot'
 usage <- 'usage: %prog [options] [params]'
 opt_parser_obj <- opt_parser(subcmds_list = subcmds_list,
                             option_list = option_list,
                             description = description,
                             usage = usage)

# Print the command line message
# You can define the message order using the
# parameter help_order = c("description", "usage", "options", "subcmds", "epilogue")
print_help(opt_parser_obj)

Download functions

# Use the future package to download URLs in parallel with logs
urls <- c(paste0('https://raw.githubusercontent.com/',
 'Miachol/ftp/master/files/images/bioinstaller/maftools3.png'), 
 paste0('https://raw.githubusercontent.com/',
 'Miachol/ftp/master/files/images/bioinstaller/maftools4.png'))
 par_download(urls, sprintf('%s/%s', tempdir(), basename(urls)))

Colors

set_colors('default')
set_colors('proteinpaint_mutations')
set_colors('proteinpaint_chromHMM_state')

Tools

Some experimental or unpackaged scripts and tools for NGS data analysis will be collected in the ngstk package. A dedicated markdown document explains how to use each of them, for example QualityConfirm and gvmap.

QualityConfirm

QualityConfirm is a quality control tool for gene panel sequencing data. Usage of QualityConfirm can be found in QualityConfirm and the demo can help you to use it more easily.

gvmap

gvmap is an R package to draw mutation and fusion heatmaps. It relies on the configr and rsvg R packages. This package is an external tool that will be developed independently by ytdai.

Theme

ngstk provides some predefined color themes, which you can download directly.

Title = "ngstk theme configuration file (colors)"

[default]
colors = ["#0073c3", "#efc000", "#696969",
"#ce534c", "#7ba6db", "#035892",
"#052135", "#666633", "#660000", "#990000"]
[red_blue]
colors = ["#c20b01", "#196abd"]

[proteinpaint_mutations]
colors = ["#3987cc", "#ff7f0e", "#db3d3d", "#6633ff",
"#bbbbbb", "#9467bd", "#998199", "#8c564b", "#819981",
"#5781ff"]

[proteinpaint_domains]
colors = ["#a6d854", "#8dd3c7", "#fb8072", "#80b1d3", "#bebada", "#e5c494", "#fdb462", "#b3b3b3"]

[proteinpaint_chromHMM_state]
colors = ["#c0222c", "#f12424", "#ff00c7", "#d192fb", "#f9982f", "#fcc88e",
"#fbf876", "#a6d67b", "#1fb855", "#007d37", "#00a99e", "#11aaec",
"#186db9", "#3800f8", "#961a8b", "#47005f"]

[proteinpaint_significance]
colors = ["#aaaaaa", "#e99002", "#5bc0de", "#f04124", "#90c3d4", "#f04124", "#43ac6a"]

[adobe_color_cc_1]
colors = ["#FFE350", "#E8740C", "#FF0000", "#9C0CE8", "#0D43FF",
"#A6B212", "#1991FF", "#ECFF00", "#CC1E14", "#B25C58"]

Copy Link

Version

Install

install.packages('ngstk')

Monthly Downloads

14

Version

0.2.3

License

MIT + file LICENSE

Issues

Pull Requests

Stars

Forks

Maintainer

Jianfeng Li

Last Published

November 22nd, 2018

Functions in ngstk (0.2.3)

make_option

Functions to enable our OptionParser to recognize specific command line options (optparse).
format_filenames

Function to format filenames that can be used to unify the filenames style for more easily download or use
ngstk

ngstk can be used to facilitate the analysis of NGS data, such as visualization, conversion of data formats for web service input, and other purposes.
batch_file

Process the input file one batch at a time
get_split_seqs

Function to calculate the split regions by sections and total numbers
rbin

Function to generate executable files for R package
set_colors

Function to get a series of defined theme colors
fusions2oncoprinter

Function to convert fusion data to cbioportal Oncoprinter format.
format_pp_meta_gender

To format ProteinPaint input meta data of gender
opt_parser

A function to create an instance of a parser object with subcommands.
split_col_data

Data split function by column
format_pp_meta_age

To format ProteinPaint input meta data of age
fusions2pp_meta

Function to convert fusion data to ProteinPaint heatmap meta rows format.
par_download

Function to download multiple files at the same time.
parse_args

Parse command line options (optparse).
split_row_file

Function to split a big file into a series of small files (by row)
supress_any_message

suppressWarnings(suppressMessages(...))
muts2oncoprinter

Function to convert mutation data to cbioportal Oncoprinter format.
format_pp_meta_fusions

To format ProteinPaint input meta data of fusions
show_mhandlers

Function to show all available mhandler functions
merge_table_files

Util function to merge multiple table files.
fusions_filter

Fusions handler_data filter that can be used to prepare the input data for downstream analysis
fusions2pp

Function to convert fusion data to ProteinPaint format.
get_files_mtime

Function to check the last modification time of files and, according to the requirement, return the check value
print_help

Printing a usage message from an OptionParser object
muts2pp

Function to convert mutation data to ProteinPaint format.
muts2mutation_mapper

Function to convert mutation data to cbioportal MutationMapper format.
parse_args2

Parse command line options.
split_list

Function to split list
get_files_ctime

Function to check the creation time of files and, according to the requirement, return the check value
split_row_data

Data split function by row
time_stamp

Function to generate time stamp in the files name or directories name.
set_tools

Function to generate tools path object
get_pp_samplegroup

Function to get a samplegroup file that can be passed to ProteinPaint
show_handlers

Function to show all available handler functions