Usage
subNonStandardNames(x,
standardCharacters=c(letters, LETTERS, ' ','.', ',', 0:9,
'"', "'", '-', '_', '(', ')', '[', ']', ''),
replacement='_',
gsubList=list(list(pattern='\\\\\\\\|\\\\',
replacement='"')),
removeSecondLine=TRUE,
nonStandardNames=Ecdat::nonEnglishNames, ...)
- x
{
character vector or matrix or a data.frame of character
vectors in which it is desired to replace nonStandardNames[, 1]
in subNonStandardCharacters(x, ...) with the corresponding
element of nonStandardNames[, 2].
}
- standardCharacters, replacement, gsubList, ...
{
arguments passed to subNonStandardCharacters
}
- removeSecondLine
{
logical: If TRUE, delete anything following "\n" and return it as
an attribute "secondLine".
}
- nonStandardNames
{
data.frame or character matrix with two columns: Replace any
substring of x matching nonStandardNames[, 1] with the
corresponding element of nonStandardNames[, 2].
}
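1. If removeSecondLine is TRUE, delete anything following "\n" and
save it to be returned as the attribute "secondLine".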
2. x. <- subNonStandardCharacters(x, standardCharacters, replacement,
...)
3. Loop over all rows of nonStandardNames substituting anything
matching nonStandardNames[i, 1] with nonStandardNames[i,
2].
4. Eliminate leading and trailing blanks.
NOTE: On 13 May 2013 Jeff Newmiller at the University of California,
Davis, wrote, 'I think it is a fools errand to think that you can
automatically "normalize" arbitrary Unicode characters to an ASCII
form that everyone will agree on.' (This was a reply on
r-help@r-project.org, subject: "Re: [R] Matching names with non-
English characters".) Doubtless someone has software to do a better
job of this than what this function does, but I've so far been unable
to find it in R. If you know of a better solution to this problem,
I'd be pleased to hear from you. Spencer Graves
a character vector with all nonStandardCharacters replaced first
by replacement and then by the second column of
nonStandardNames for any that match the first column. If a
secondLine is found on any elements, it is returned as a "secondLine"
attribute.
[object Object]
sub
nonEnglishNames
subNonStandardCharacters
##
## 1. Example
##
# Test input: plain names, a name with a mis-encoded character,
# "surname, given" forms, backslash-quoted nickname, a two-line
# entry, and empty / blank strings.
tstSNSN <- c('Raul', 'Ra`l', 'Torres,Raul', 'Torres, Raul',
             "Robert C. \\Bobby\\\\", 'Ed \n --Vacancy', '', ' ')
# confusion in character sets can create
# names like tstSNSN[2]
##
## 2. subNonStandardNames(vector)
##
# NOTE: this assignment must not be commented out; SNS2 is used below.
SNS2 <- subNonStandardNames(tstSNSN)
SNS2
# check: expected result, with the text after "\n" moved to the
# "secondLine" attribute and leading / trailing blanks removed
SNS2. <- c('Raul', 'Raul', tstSNSN[3:4],
           'Robert C. "Bobby"', 'Ed', '', '')
attr(SNS2., 'secondLine') <- c(rep(NA, 5), ' --Vacancy', NA, NA)
SNS2.
stopifnot(
  all.equal(SNS2, SNS2.)
)
##
## 3. subNonStandardNames(matrix)
##
tstmat <- parseName(tstSNSN, surnameFirst=TRUE)
submat <- subNonStandardNames(tstmat)
# check: cleaning the parsed matrix should match parsing the
# already-cleaned vector
SNSmat <- parseName(SNS2., surnameFirst=TRUE)
stopifnot(
  all.equal(submat, SNSmat)
)
##
## 4. subNonStandardNames(data.frame)
##
tstdf <- as.data.frame(tstmat)
subdf <- subNonStandardNames(tstdf)
# check: the data.frame result should match the data.frame built
# from the expected matrix
SNSdf <- as.data.frame(SNSmat)
stopifnot(
  all.equal(subdf, SNSdf)
)
manip