
bcputility

bcputility is a wrapper for bcp, the command line utility that ships with SQL Server for bulk imports and exports. The package assumes that bcp is already installed and on the system search path. For large inserts to SQL Server over an ODBC connection (e.g. with the "DBI" package), writes can take a very long time because each row generates an individual INSERT statement. The bcp utility greatly improves the performance of large writes by using bulk inserts.

An export function is provided for convenience, but it is unlikely to significantly improve performance over other methods.
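
A minimal import looks like this (a sketch: the server and database names below are placeholders, and a trusted connection is assumed; see makeConnectArgs() for other authentication options):

library(bcputility)
# Translate the server/database into bcp and sqlcmd command line arguments
connectArgs <- makeConnectArgs(server = 'localhost', database = 'master')
df <- data.frame(id = 1:3, value = c('a', 'b', 'c'))
# Bulk insert the data frame, replacing the table if it already exists
bcpImport(df, connectargs = connectArgs, table = 'df', overwrite = TRUE)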

Installation

You can install the released version of bcputility from CRAN with:

install.packages("bcputility")

Install the development version with:

devtools::install_github("tomroh/bcputility")

If bcp and sqlcmd are not on the system path, or you want to override the defaults, set the options with the full file paths:

options(bcputility.bcp.path = "<path-to-bcp>")
options(bcputility.sqlcmd.path = "<path-to-sqlcmd>")
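
To verify that the executables can be found after setting these options, the package's SQLServerCLIVersions() can be used (a sketch, assuming it runs with no arguments):

# Reports the versions of the bcp and sqlcmd executables that will be used
SQLServerCLIVersions()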

Benchmarks

Benchmarks were performed with a local installation of SQL Server Express. When testing with a remote SQL Server, the performance advantage of bcp over ODBC was even greater.

Import

library(DBI)
library(data.table)
library(bcputility)
server <- Sys.getenv('MSSQL_SERVER')
database <- Sys.getenv('MSSQL_DB')
driver <- 'ODBC Driver 17 for SQL Server'
set.seed(11)
n <- 1000000
importTable <- data.frame(
  int = sample(x = seq(1L, 10000L, 1L), size = n, replace = TRUE),
  numeric = sample(x = seq(0, 1, length.out = n/100), size = n,
    replace = TRUE),
  character = sample(x = state.abb, size = n, replace = TRUE),
  factor = sample(x = factor(x = month.abb, levels = month.abb),
    size = n, replace = TRUE),
  logical = sample(x = c(TRUE, FALSE), size = n, replace = TRUE),
  date = sample(x = seq(as.Date('2022-01-01'), as.Date('2022-12-31'),
    by = 'days'), size = n, replace = TRUE),
  datetime = sample(x = seq(as.POSIXct('2022-01-01 00:00:00'),
    as.POSIXct('2022-12-31 23:59:59'), by = 'min'), size = n, replace = TRUE)
)
connectArgs <- makeConnectArgs(server = server, database = database)
con <- DBI::dbConnect(odbc::odbc(),
                      Driver = driver,
                      Server = server,
                      Database = database,
                      trusted_connection = 'yes')
importResults <- microbenchmark::microbenchmark(
  bcpImport1000 = {
    bcpImport(importTable,
              connectargs = connectArgs,
              table = 'importTable1',
              bcpOptions = list("-b", 1000, "-a", 4096, "-e", 10),
              overwrite = TRUE,
              stdout = FALSE)
    },
  bcpImport10000 = {
    bcpImport(importTable,
              connectargs = connectArgs,
              table = 'importTable2',
              bcpOptions = list("-b", 10000, "-a", 4096, "-e", 10),
              overwrite = TRUE,
              stdout = FALSE)
  },
  bcpImport50000 = {
    bcpImport(importTable,
              connectargs = connectArgs,
              table = 'importTable3',
              bcpOptions = list("-b", 50000, "-a", 4096, "-e", 10),
              overwrite = TRUE,
              stdout = FALSE)
  },
  bcpImport100000 = {
    bcpImport(importTable,
      connectargs = connectArgs,
      table = 'importTable4',
      bcpOptions = list("-b", 100000, "-a", 4096, "-e", 10),
      overwrite = TRUE,
      stdout = FALSE)
  },
  dbWriteTable = {
    con <- DBI::dbConnect(odbc::odbc(),
      Driver = driver,
      Server = server,
      Database = database,
      trusted_connection = 'yes')
    DBI::dbWriteTable(con, name = 'importTable5', importTable, overwrite = TRUE)
    },
  times = 30L,
  unit = 'seconds'
)
importResults
expr               min        lq         mean       median     uq         max       neval
bcpImport1000      15.017385  16.610868  17.405555  17.656265  18.100990  19.44482     30
bcpImport10000     10.091266  10.657926  10.926738  10.916577  11.208184  11.46027     30
bcpImport50000      8.982498   9.337509   9.677375   9.571526   9.896179  10.77709     30
bcpImport100000     8.769598   9.303473   9.562921   9.581927   9.855355  10.36949     30
dbWriteTable       13.570956  13.820707  15.154505  14.159002  16.378986  27.28819     30

Time in seconds

Export Table

Note: bcp exports may not match the output format of fwrite. dateTimeAs = 'write.csv' was used to make the timings comparable, which decreased the performance of "data.table". With its optimized date-time write formats, fwrite outperforms bcp when the data is small enough to be pulled into memory.

exportResults <- microbenchmark::microbenchmark(
  bcpExportChar = {
    bcpExport('inst/benchmarks/test1.csv',
              connectargs = connectArgs,
              table = 'importTableInit',
              fieldterminator = ',',
              stdout = FALSE)
    },
  bcpExportNchar = {
    bcpExport('inst/benchmarks/test2.csv',
              connectargs = connectArgs,
              table = 'importTableInit',
              fieldterminator = ',',
              stdout = FALSE)
  },
  fwriteQuery = {
    fwrite(DBI::dbReadTable(con, 'importTableInit'),
           'inst/benchmarks/test3.csv', dateTimeAs = 'write.csv',
           col.names = FALSE)
  },
  times = 30L,
  unit = 'seconds'
)
exportResults
expr            min       lq        mean      median    uq        max       neval
bcpExportChar   2.565654  2.727477  2.795670  2.756685  2.792291  3.352325     30
bcpExportNchar  2.589367  2.704135  2.765784  2.734957  2.797286  3.479074     30
fwriteQuery     7.429731  7.602853  7.645852  7.654730  7.703634  7.868419     30

Time in seconds

Export Query

query <- 'SELECT * FROM [dbo].[importTable1] WHERE int < 1000'
queryResults <- microbenchmark::microbenchmark(
  bcpExportQueryChar = {
    bcpExport('inst/benchmarks/test4.csv',
              connectargs = connectArgs,
              query = query,
              fieldterminator = ',',
              stdout = FALSE)
  },
  bcpExportQueryNchar = {
    bcpExport('inst/benchmarks/test5.csv',
              connectargs = connectArgs,
              query = query,
              fieldterminator = ',',
              stdout = FALSE)
  },
  fwriteQuery = {
    fwrite(DBI::dbGetQuery(con, query),
           'inst/benchmarks/test6.csv', dateTimeAs = 'write.csv',
           col.names = FALSE)
  },
  times = 30L,
  unit = 'seconds'
)
queryResults
expr                 min        lq         mean       median     uq         max        neval
bcpExportQueryChar   0.3444491  0.4397317  0.4557119  0.4490924  0.4615573  0.7237182     30
bcpExportQueryNchar  0.3305265  0.4444705  0.4412670  0.4500690  0.4605971  0.4815894     30
fwriteQuery          0.6737879  0.7141933  0.7421377  0.7311998  0.7548233  0.9143555     30

Time in seconds

Import Geometry

Importing spatial data from 'sf' objects is also supported. The SQL statements executed after the ODBC imports produce tables equivalent to those created by bcpImport.

library(sf)
nc <- st_read(system.file("gpkg/nc.gpkg", package = "sf"))
divN <- 10
shp1 <- cbind(nc[sample.int(nrow(nc), n / divN, replace = TRUE),],
  importTable[seq_len(n / divN), ],
  id = seq_len(n / divN))
geometryResults <- microbenchmark::microbenchmark(
  bcpImportGeometry = {
    bcpImport(shp1,
      connectargs = connectArgs,
      table = 'shp1',
      overwrite = TRUE,
      stdout = FALSE,
      spatialtype = 'geometry',
      bcpOptions = list("-b", 50000, "-a", 4096, "-m", 0))
  },
  odbcImportGeometry = {
    con <- DBI::dbConnect(odbc::odbc(),
      driver = driver,
      server = server,
      database = database,
      trusted_connection = 'yes')
    tableName <- 'shp2'
    spatialType <- 'geometry'
    geometryColumn <- 'geom'
    binaryColumn <- 'geomWkb'
    srid <- sf::st_crs(nc)$epsg
    shpBin2 <- data.table(shp1)
    data.table::set(x = shpBin2, j = binaryColumn,
      value = blob::new_blob(lapply(sf::st_as_binary(shpBin2[[geometryColumn]]),
        as.raw)))
    data.table::set(x = shpBin2, j = geometryColumn, value = NULL)
    dataTypes <- DBI::dbDataType(con, shpBin2)
    dataTypes[binaryColumn] <- 'varbinary(max)'
    DBI::dbWriteTable(conn = con, name = tableName, value = shpBin2,
      overwrite = TRUE, field.types = dataTypes)
    DBI::dbExecute(conn = con, sprintf('alter table %1$s add %2$s %3$s;',
      tableName, geometryColumn, spatialType))
    DBI::dbExecute(conn = con,
      sprintf('UPDATE %1$s
    SET geom = %3$s::STGeomFromWKB([%4$s], %2$d);
    ALTER TABLE %1$s DROP COLUMN [%4$s];', tableName, srid, spatialType,
        binaryColumn)
    )
  },
  bcpImportGeography = {
    bcpImport(shp1,
      connectargs = connectArgs,
      table = 'shp3',
      overwrite = TRUE,
      stdout = FALSE,
      spatialtype = 'geography',
      bcpOptions = list("-b", 50000, "-a", 4096, "-m", 0))
  },
  odbcImportGeography = {
    con <- DBI::dbConnect(odbc::odbc(),
      driver = driver,
      server = server,
      database = database,
      trusted_connection = 'yes')
    tableName <- 'shp4'
    spatialType <- 'geography'
    geometryColumn <- 'geom'
    binaryColumn <- 'geomWkb'
    srid <- sf::st_crs(nc)$epsg
    shpBin4 <- data.table(shp1)
    data.table::set(x = shpBin4, j = binaryColumn,
      value = blob::new_blob(lapply(sf::st_as_binary(shpBin4[[geometryColumn]]),
        as.raw)))
    data.table::set(x = shpBin4, j = geometryColumn, value = NULL)
    dataTypes <- DBI::dbDataType(con, shpBin4)
    dataTypes[binaryColumn] <- 'varbinary(max)'
    DBI::dbWriteTable(conn = con, name = tableName, value = shpBin4,
      overwrite = TRUE, field.types = dataTypes)
    DBI::dbExecute(conn = con, sprintf('alter table %1$s add %2$s %3$s;',
      tableName, geometryColumn, spatialType))
    DBI::dbExecute(conn = con,
      sprintf('UPDATE %1$s
    SET geom = %3$s::STGeomFromWKB([%4$s], %2$d);
    ALTER TABLE %1$s DROP COLUMN [%4$s];', tableName, srid, spatialType,
        binaryColumn)
    )
    DBI::dbExecute(conn = con,
      sprintf(
        'UPDATE %1$s SET [%2$s] = [%2$s].MakeValid().ReorientObject().MakeValid()
   WHERE [%2$s].MakeValid().EnvelopeAngle() > 90;',
        tableName, geometryColumn))
  },
  times = 30L,
  unit = 'seconds'
)
geometryResults
expr                 min       lq        mean      median    uq        max        neval
bcpImportGeometry    18.01451  19.48747  20.68834  20.45136  21.74212   26.87033     30
odbcImportGeometry   18.29721  20.63363  22.35044  21.29087  24.04490   27.81112     30
bcpImportGeography   71.23260  75.04588  82.65286  76.36985  96.68469  102.70909     30
odbcImportGeography  73.29818  76.12481  84.58432  77.93419  97.36155  107.00186     30

Time in seconds


Monthly Downloads

405

Version

0.4.3

License

MIT + file LICENSE


Maintainer

Thomas Roh

Last Published

May 9th, 2024

Functions in bcputility (0.4.3)

bcpExport: Export data from SQL Server

SQLServerCLIVersions: Check bcp and sqlcmd versions

createTable: Create or drop table

makeConnectArgs: Create a named list of connection arguments to translate to bcp and sqlcmd options

bcpImport: Import data to SQL Server

mapDataTypes: Determine SQL Server data types from a data frame. Follows SQL Server data type size constraints and chooses the smallest data type size.
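
As a quick illustration of mapDataTypes, the sketch below previews the column types for the benchmark data frame from the Import section (assuming mapDataTypes() accepts a data frame; see ?mapDataTypes for the full signature):

library(bcputility)
# Preview the SQL Server data type chosen for each column of the
# benchmark data frame created in the Import section above
mapDataTypes(importTable)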