
rcrossref (version 0.5.2)

cr_ft_text: Get full text from a DOI

Description

Get full text from a DOI

Usage

cr_ft_text(url, type = "xml", path = "~/.crossref", overwrite = TRUE, read = TRUE, verbose = TRUE, cache = TRUE, ...)
cr_ft_plain(url, path = "~/.crossref", overwrite = TRUE, read = TRUE, verbose = TRUE, ...)
cr_ft_xml(url, path = "~/.crossref", overwrite = TRUE, read = TRUE, verbose = TRUE, ...)
cr_ft_pdf(url, path = "~/.crossref", overwrite = TRUE, read = TRUE, cache = FALSE, verbose = TRUE, ...)

Arguments

url
(character) A full text URL, typically obtained from cr_ft_links.
type
(character) One of xml, plain, pdf, or all
path
(character) Path to store pdfs in. Default: ~/.crossref/
overwrite
(logical) Overwrite file if it exists already? Default: TRUE
read
(logical) If reading a pdf, this toggles whether we extract text from the pdf or simply download it. If TRUE, you get the text from the pdf back; if FALSE, you only get back the metadata. Default: TRUE
verbose
(logical) Print progress messages. Default: TRUE
cache
(logical) Use cached files or not. All files are written to your machine locally, so this doesn't affect that; it only controls whether a previously downloaded copy is reused so the file doesn't have to be downloaded again. The steps of extracting and reading into R are still performed when cache = TRUE. Default: TRUE
...
Named parameters passed on to GET. See the sketch after this list for how the main arguments fit together.
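
A minimal sketch of how these arguments combine (not run; the DOI is illustrative, the target directory is arbitrary, and the link comes from cr_ft_links as in the Examples below):

links <- cr_ft_links("10.3897/phytokeys.42.7604", "all")
# download the PDF to a custom directory, overwriting any existing copy,
# and skip text extraction (read = FALSE)
pdf_meta <- cr_ft_text(links, type = "pdf", path = "~/crossref-pdfs",
                       overwrite = TRUE, read = FALSE, verbose = FALSE)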

Details

Note that cr_ft_text, cr_ft_pdf, cr_ft_xml, cr_ft_plain are not vectorized.
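To fetch many documents, loop over them yourself. A minimal sketch, assuming dois is a character vector of DOIs that each resolve to an XML full text link:

texts <- lapply(dois, function(d) {
  cr_ft_text(cr_ft_links(d, "xml"), type = "xml")
})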

Note that some links returned will not in fact lead to full text content, as you would understandably think and expect. That is, if you use the filter parameter with, e.g., cr_works to restrict results to full text content, some links may actually give back only metadata for an article. Elsevier is perhaps the worst offender, partly because they have a lot of entries in Crossref TDM, but most of their links that appear to be full text are in fact only metadata.
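
There is no sure way to tell in advance which links are full text. One crude heuristic (a sketch only; link is assumed to be a plain text URL from cr_ft_links, and the 1000 character cutoff is arbitrary) is to fetch the plain text version and check its length:

txt <- cr_ft_text(link, type = "plain")
if (nchar(paste(txt, collapse = " ")) < 1000) {
  message("response is suspiciously short; possibly metadata only")
}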

Examples

## Not run: 
# # pdf link
# cr_ft_links(doi = "10.5555/515151", "pdf")
# 
# # xml and plain text links
# out <- cr_works(filter=c(has_full_text = TRUE))
# dois <- out$data$DOI
# cr_ft_links(dois[1], "pdf")
# cr_ft_links(dois[2], "xml")
# cr_ft_links(dois[1], "plain")
# cr_ft_links(dois[1], "all")
# 
# # No links
# cr_ft_links(cr_r(1), "xml")
# 
# # get full text
# ## pensoft
# out <- cr_members(2258, filter=c(has_full_text = TRUE), works = TRUE)
# (links <- cr_ft_links(out$data$DOI[1], "all"))
# ### xml
# cr_ft_text(links, 'xml')
# ### pdf
# cr_ft_text(links, "pdf", read=FALSE)
# cr_ft_text(links, "pdf")
# 
# ### another pensoft e.g.
# links <- cr_ft_links("10.3897/phytokeys.42.7604", "all")
# pdf_read <- cr_ft_text(url = links, type = "pdf", read=FALSE, verbose = FALSE)
# pdf <- cr_ft_text(links, "pdf", verbose = FALSE)
# 
# ## hindawi
# out <- cr_members(98, filter=c(has_full_text = TRUE), works = TRUE)
# (links <- cr_ft_links(out$data$DOI[1], "all"))
# ### xml
# cr_ft_text(links, 'xml')
# ### pdf
# cr_ft_text(links, "pdf", read=FALSE)
# cr_ft_text(links, "pdf")
# 
# ## search for works with full text, and with CC-BY 3.0 license
# ### you can see available licenses with cr_licenses() function
# out <-
#  cr_works(filter = list(has_full_text = TRUE,
#    license_url="http://creativecommons.org/licenses/by/3.0/"))
# (links <- cr_ft_links(out$data$DOI[10], "all"))
# cr_ft_text(links, 'xml')
# 
# ## You can use cr_ft_xml, cr_ft_plain, and cr_ft_pdf to go directly to that format
# licenseurl <- "http://creativecommons.org/licenses/by/3.0/"
# out <- cr_works(filter = list(has_full_text = TRUE, license_url = licenseurl))
# (links <- cr_ft_links(out$data$DOI[10], "all"))
# cr_ft_xml(links)
# cr_ft_pdf(links)
# 
# # Caching, for PDFs
# out <- cr_members(2258, filter=c(has_full_text = TRUE), works = TRUE)
# (links <- cr_ft_links(out$data$DOI[10], "all"))
# cr_ft_text(links, type = "pdf", cache=FALSE)
# system.time( cacheyes <- cr_ft_text(links, type = "pdf", cache=TRUE) )
# # second time should be faster
# system.time( cacheyes <- cr_ft_text(links, type = "pdf", cache=TRUE) )
# system.time( cacheno <- cr_ft_text(links, type = "pdf", cache=FALSE) )
# identical(cacheyes, cacheno)
# 
# ## elsevier
# ## requires extra authentication
# out <- cr_members(78, filter=c(has_full_text = TRUE), works = TRUE)
# ## set key first
# # Sys.setenv(CROSSREF_TDM_ELSEVIER = "your-key")
# ## XML
# link <- cr_ft_links(out$data$DOI[1], "xml")
# # res <- cr_ft_text(url = link, type = "xml")
# ## plain text
# link <- cr_ft_links(out$data$DOI[1], "plain")
# # res <- cr_ft_text(url = link, "plain")
# 
# ## Wiley
# Sys.setenv(CROSSREF_TDM = "your-key")
# 
# ### all wiley
# out <- cr_members(311, filter=c(has_full_text = TRUE, type = 'journal-article'), works = TRUE)
# dois <- out$data$DOI[1:10]
# # res <- list()
# # for (i in seq_along(dois)) {
# #   tmp <- cr_ft_links(dois[i], "all")
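# #   # these links come back untyped, so label them as pdf for cr_ft_text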
# #   tmp <- setNames(tmp, "pdf")
# #   attr(tmp[[1]], "type") <- "pdf"
# #   res[[i]] <- cr_ft_text(tmp, type = "pdf", cache=FALSE)
# # }
# # res
# 
# #### older dates
# out <- cr_members(311, filter=c(has_full_text = TRUE,
#       type = 'journal-article', until_created_date = "2013-12-31"), works = TRUE)
# 
# dois <- out$data$DOI[1:10]
# # res <- list()
# # for (i in seq_along(dois)) {
# #   tmp <- cr_ft_links(dois[i], "all")
# #   tmp <- setNames(tmp, "pdf")
# #   attr(tmp[[1]], "type") <- "pdf"
# #   res[[i]] <- cr_ft_text(tmp, type = "pdf", cache=FALSE)
# # }
# # res
# 
# ### wiley subset with CC By 4.0 license
# lic <- "http://creativecommons.org/licenses/by/4.0/"
# out <- cr_members(311, filter=c(has_full_text = TRUE, license.url = lic), works = TRUE)
# dois <- out$data$DOI[1:10]
# # res <- list()
# # for (i in seq_along(dois)) {
# #   tmp <- cr_ft_links(dois[i], "all")
# #   tmp <- setNames(tmp, "pdf")
# #   attr(tmp[[1]], "type") <- "pdf"
# #   res[[i]] <- cr_ft_text(tmp, type = "pdf", cache=FALSE)
# # }
# ## End(Not run)
