Checks if a dataset conforms to a given set of rules
Usage
check_data(
x,
rules,
xname = deparse(substitute(x)),
stop_on_fail = FALSE,
stop_on_warn = FALSE,
stop_on_error = FALSE,
stop_on_schema_fail = FALSE,
extra_columns = c("ignore", "warn", "fail")
)
Arguments
- x
a dataset, either a
data.frame, dplyr::tibble, data.table::data.table, arrow::arrow_table, arrow::open_dataset, or dplyr::tbl (SQL connection). Can also be a named list of datasets when using reference rules.
- rules
a list of
rules
- xname
optional, a name for the x variable (only used for errors)
- stop_on_fail
when any of the rules fail, throw an error with stop
- stop_on_warn
when a warning is found in the code execution, throw an error with stop
- stop_on_error
when an error is found in the code execution, throw an error with stop
- stop_on_schema_fail
when any schema checks fail, throw an error with stop
- extra_columns
how to treat columns in
x that are not declared in optional data_columns attached to a ruleset. One of "ignore" (default), "warn", or "fail".
Examples
rs <- ruleset(
rule(mpg > 10),
rule(cyl %in% c(4, 6)), # missing 8
rule(qsec >= 14.5 & qsec <= 22.9)
)
rs
#> <Verification Ruleset with 3 elements>
#> [1] 'Rule for: mpg' matching `mpg > 10` (allow_na: FALSE)
#> [2] 'Rule for: cyl' matching `cyl %in% c(4, 6)` (allow_na: FALSE)
#> [3] 'Rule for: qsec' matching `qsec >= 14.5 & qsec <= 22.9` (allow_na: FALSE)
check_data(mtcars, rs)
#> check_type name expr allow_na negate tests
#> <char> <char> <char> <lgcl> <lgcl> <int>
#> 1: row_rule Rule for: mpg mpg > 10 FALSE FALSE 32
#> 2: row_rule Rule for: cyl cyl %in% c(4, 6) FALSE FALSE 32
#> 3: row_rule Rule for: qsec qsec >= 14.5 & qsec <= 22.9 FALSE FALSE 32
#> pass fail warn error time
#> <int> <int> <char> <char> <difftime>
#> 1: 32 0 0.0048727989 secs
#> 2: 18 14 0.0025248528 secs
#> 3: 32 0 0.0003015995 secs
# schema + relation checks in one output
orders <- data.frame(order_id = 1:3, customer_id = c(10, 99, NA), amount = c(10, -5, 20))
customers <- data.frame(customer_id = c(10, 11))
rs2 <- ruleset(
rule(amount >= 0, name = "amount non-negative"),
reference_rule(
local_col = "customer_id",
ref_dataset = "customers",
ref_col = "customer_id",
allow_na = TRUE
),
data_columns = list(
data_column("order_id", type = "int", optional = FALSE),
data_column("customer_id", type = "double", optional = FALSE),
data_column("amount", type = "double", optional = FALSE)
),
data_name = "orders"
)
check_data(list(orders = orders, customers = customers), rs2)
#> check_type name
#> <char> <char>
#> 1: schema Schema: column 'order_id' exists
#> 2: schema Schema: column 'order_id' has type 'int'
#> 3: schema Schema: column 'customer_id' exists
#> 4: schema Schema: column 'customer_id' has type 'double'
#> 5: schema Schema: column 'amount' exists
#> 6: schema Schema: column 'amount' has type 'double'
#> 7: row_rule amount non-negative
#> 8: reference_rule Reference rule: customer_id in customers$customer_id
#> expr allow_na negate tests pass fail
#> <char> <lgcl> <lgcl> <int> <int> <int>
#> 1: column_exists('order_id') FALSE FALSE 1 1 0
#> 2: column_type('order_id') == 'int' FALSE FALSE 1 1 0
#> 3: column_exists('customer_id') FALSE FALSE 1 1 0
#> 4: column_type('customer_id') == 'double' FALSE FALSE 1 1 0
#> 5: column_exists('amount') FALSE FALSE 1 1 0
#> 6: column_type('amount') == 'double' FALSE FALSE 1 1 0
#> 7: amount >= 0 FALSE FALSE 3 2 1
#> 8: customer_id %in% customers$customer_id TRUE FALSE 3 2 1
#> warn error time
#> <char> <char> <difftime>
#> 1: 0.000000e+00 secs
#> 2: 0.000000e+00 secs
#> 3: 0.000000e+00 secs
#> 4: 0.000000e+00 secs
#> 5: 0.000000e+00 secs
#> 6: 0.000000e+00 secs
#> 7: 3.309250e-04 secs
#> 8: 5.054474e-05 secs