# NOT RUN {
## Named character vector of example subjects; the element names are
## what later show up as rownames of the match results.
named.subject.vec <- c(
  ten = "chr10:213,054,000-213,055,000",
  M = "chrM:111,000",
  one = "chr1:110-111 chr2:220-222" # two possible matches.
)
## For every element of the subject character vector,
## str_match_variable reports the first match only. Each named
## argument defines a named capture group, and the group names become
## the column names of the result. The subject here is named, so its
## names are used as the rownames of the result.
(mat.subject.names <- namedCapture::str_match_variable(
  named.subject.vec,
  chrom = "chr.*?",
  ":",
  chromStart = "[0-9,]+",
  list( # an un-named list becomes a non-capturing group.
    "-",
    chromEnd = "[0-9,]+"
  ),
  "?" # chromEnd is optional.
))
## With no type conversion functions specified, the result is a
## character matrix.
str(mat.subject.names)
## A conversion function placed after a pattern converts the text
## matched by the previously named group. Patterns (together with
## their conversion function) can be stored in lists and re-used.

## Strip every non-digit character from x (e.g. thousands separators)
## and convert what remains to integer.
keep.digits <- function(x) {
  digits.only <- gsub("[^0-9]", "", x)
  as.integer(digits.only)
}

## Reusable pattern: digits and commas, converted via keep.digits.
int.pattern <- list("[0-9,]+", keep.digits)
## Reusable pattern list: a name group, ":", an integer chromStart,
## and an optional "-chromEnd" part built from int.pattern.
range.pattern <- list(
  name = "chr.*?", # will be used for rownames when subject is un-named.
  ":",
  chromStart = int.pattern,
  list(
    "-",
    chromEnd = int.pattern
  ),
  "?"
)
## When the subject has names, they are used as the rownames.
(df.subject.names <- namedCapture::str_match_variable(
  named.subject.vec,
  range.pattern
))
## The conversion functions produce non-character columns.
str(df.subject.names)
## When the subject is un-named, rownames come from the name group.
namedCapture::str_match_variable(
  unname(named.subject.vec),
  range.pattern
)
## NA in the result indicates either no match or a missing subject.
na.vec <- c(
  nomatch = "this will not match",
  missing = NA, # neither will this.
  named.subject.vec
)
namedCapture::str_match_variable(
  na.vec,
  range.pattern
)
# }
# Run the code above in your browser using DataLab