# Example: building DataBackendPolars objects from polars LazyFrames.
# The whole example is skipped when the optional 'polars' package is missing.
if (mlr3misc::require_namespaces("polars", quietly = TRUE)) {
  # Backend using an in-memory data set
  data = iris
  data$Sepal.Length[1:30] = NA
  # Integer primary key, one id per row (1..150 for iris)
  data$row_id = seq_len(nrow(data))
  data = polars::as_polars_lf(data)
  b = DataBackendPolars$new(data, primary_key = "row_id")

  # Object supports all accessors of DataBackend
  print(b)
  b$nrow
  b$ncol
  b$colnames
  b$data(rows = 100:101, cols = "Species")
  b$distinct(b$rownames, "Species")

  # Classification task using this backend
  task = mlr3::TaskClassif$new(id = "iris_polars", backend = b, target = "Species")
  print(task)
  head(task)

  # Write a parquet file to scan. A temporary path is used so the example
  # neither litters the working directory nor overwrites an existing file.
  path = tempfile(fileext = ".parquet")
  data$collect()$write_parquet(path)
  data = polars::pl$scan_parquet(path)

  # Backend that re-reads the parquet file if the connection fails
  b = DataBackendPolars$new(data, "row_id",
    connector = function() polars::pl$scan_parquet(path))
  print(b)

  # Define a backend on a subset of the database: do not use column "Sepal.Width"
  data = data$select(
    polars::pl$col(setdiff(colnames(data), "Sepal.Width"))
  )$filter(
    polars::pl$col("row_id")$is_in(1:120) # Use only first 120 rows
  )

  # Backend with only scanned data
  b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE)
  print(b)

  # Query distinct values
  b$distinct(b$rownames, "Species")

  # Query number of missing values
  b$missings(b$rownames, b$colnames)

  # Cleanup
  if (file.exists(path)) {
    file.remove(path)
  }
}
# Run the code above in your browser using DataLab