Regular imputations to fill the missing data. Non-missing independent variables are used to approximate the missing observations of a dependent variable. The quantitative models are built with the Rcpp package and the C++ library Armadillo.
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
# S3 method for data.frame
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
# S3 method for data.table
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
# S3 method for matrix
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables
a character - possible options ("lda","lm_pred","lm_bayes","lm_noise")
an integer/character - a position/name of dependent variable
an integer/character vector - positions/names of independent variables
a numeric vector - a weighting variable - only positive values, Default:NULL
a boolean - whether the dependent variable has a log-normal distribution (numeric). If TRUE, a log-regression is evaluated and the exponential of the results is returned. Default: FALSE
a numeric - a value added to the diagonal elements of the x'x matrix, Default: 1e-6
imputations returned as a numeric/logical/character/factor vector (similar to the input type)
fill_NA(data.frame)
: S3 method for data.frame
fill_NA(data.table)
: S3 method for data.table
fill_NA(matrix)
: S3 method for matrix
It is assumed that users add the intercept on their own. The miceFast module provides the most efficient environment; the second recommended option is to use data.table and the numeric matrix data type. The lda model is assessed only if there are more than 15 complete observations, and the lm models only if the number of independent variables is smaller than the number of observations.
library(miceFast)
library(dplyr)
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
library(data.table)
#>
#> Attaching package: ‘data.table’
#> The following objects are masked from ‘package:dplyr’:
#>
#> between, first, last
### Data
# airquality dataset with additional variables
data(air_miss)
### Intro: dplyr
# IMPUTATIONS
# Each step below adds one imputed column. Later steps reuse columns created by
# earlier ones (e.g. x_character_imp), so the order of the steps matters.
air_miss <- air_miss %>%
# Imputations with a grouping option (a separate model is assessed for each group),
# taking the provided weights into account
group_by(groups) %>%
do(mutate(., Solar_R_imp = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .[["weights"]]
))) %>%
ungroup() %>%
# Imputations - discrete variable (lda model, no grouping)
mutate(x_character_imp = fill_NA(
x = .,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp")
)) %>%
# logreg = TRUE is used because Ozone is almost log-normally distributed;
# with only the intercept as a predictor these are imputations around the mean
mutate(Ozone_imp1 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)) %>%
# imputations using column positions instead of names
# (in the printed column order: 1 = Ozone, 4 = Temp, 6 = Intercept)
mutate(Ozone_imp2 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)) %>%
# multiple imputations (average of k = 30 imputations)
# with a discrete independent variable (x_character_imp), weights and logreg options
mutate(Ozone_imp3 = fill_NA_N(
x = .,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# same setup as Ozone_imp3, but with the lm_bayes model
mutate(Ozone_imp4 = fill_NA_N(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# the next two imputations are again assessed separately for each group
group_by(groups) %>%
do(mutate(., Ozone_imp5 = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE
))) %>%
do(mutate(., Ozone_imp6 = fill_NA_N(
x = .,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 20
))) %>%
ungroup() %>%
# Average of a few methods: row-wise mean of every column whose name
# starts with "Ozone_imp" at this point of the pipeline
mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
# Protecting against collinearity or a low number of observations across small groups.
# Be careful when using a grouping option,
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can supply a fallback value.
# NOTE(review): the printed results list no "Month" column, so this call presumably
# always errors and the fallback below is what fills Ozone_chac_imp - confirm.
group_by(groups) %>%
do(mutate(., Ozone_chac_imp = tryCatch(
fill_NA(
x = .,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .[["weights"]]
),
# fallback: keep the original, un-imputed column for this group
error = function(e) .[["Ozone_chac"]]
))) %>%
ungroup()
# Sample of results: the first five rows with a missing value in the first column
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> # A tibble: 5 × 23
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_cha…¹ Ozone…²
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr> <chr>
#> 1 NA NA 14.3 56 5 1 5 0.995 5 NA NA
#> 2 NA 194 8.6 69 10 1 10 0.995 5 (140,2… NA
#> 3 NA 66 16.6 57 25 1 25 1.01 5 (0,70] NA
#> 4 NA 266 14.9 58 26 1 26 1.00 5 (210,2… NA
#> 5 NA NA 8 57 27 1 27 1.00 5 NA NA
#> # … with 12 more variables: Ozone_f <fct>, Ozone_high <lgl>, Solar_R_imp <dbl>,
#> # x_character_imp <chr>, Ozone_imp1 <dbl>, Ozone_imp2 <dbl>,
#> # Ozone_imp3 <dbl>, Ozone_imp4 <dbl>, Ozone_imp5 <dbl>, Ozone_imp6 <dbl>,
#> # Ozone_imp_mix <dbl>, Ozone_chac_imp <chr>, and abbreviated variable names
#> # ¹x_character, ²Ozone_chac
### Intro: data.table
# IMPUTATIONS
# The chain below is a single expression whose result is not assigned;
# each step adds one column with `:=`, and later steps reuse columns created by
# earlier ones (e.g. x_character_imp), so the order of the steps matters.
# Imputations with a grouping option (a separate model is assessed for each group),
# taking the provided weights into account
data(air_miss)
setDT(air_miss)
air_miss[, Solar_R_imp := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .SD[["weights"]],
k = 100
), by = .(groups)] %>%
# Imputations - discrete variable (lda model, no grouping)
.[, x_character_imp := fill_NA(
x = .SD,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp", "groups")
)] %>%
# logreg = TRUE is used because Ozone is almost log-normally distributed;
# with only the intercept as a predictor these are imputations around the mean
.[, Ozone_imp1 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)] %>%
# imputations using column positions instead of names
# (in the printed column order: 1 = Ozone, 4 = Temp, 6 = Intercept)
.[, Ozone_imp2 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)] %>%
# multiple imputations (average of k = 30 imputations)
# with a discrete independent variable (x_character_imp), weights and logreg options
.[, Ozone_imp3 := fill_NA_N(
x = .SD,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# same setup as Ozone_imp3, but with the lm_bayes model
.[, Ozone_imp4 := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# the next two imputations are again assessed separately for each group
.[, Ozone_imp5 := fill_NA(
x = .SD,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE
), .(groups)] %>%
.[, Ozone_imp6 := fill_NA_N(
x = .SD,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 10
), .(groups)] %>%
# Average of a few methods: row-wise mean over the columns Ozone_imp1 through Ozone_imp6
.[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
# Protecting against collinearity or a low number of observations across small groups.
# Be careful when using a data.table grouping option,
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can supply a fallback value.
# NOTE(review): the printed results list no "Month" column, so this call presumably
# always errors and the fallback below is what fills Ozone_chac_imp - confirm.
.[, Ozone_chac_imp := tryCatch(
fill_NA(
x = .SD,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .SD[["weights"]]
),
# fallback: keep the original, un-imputed column for this group
error = function(e) .SD[["Ozone_chac"]]
), .(groups)]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> 1: 41 190 7.4 67 1 1 1 1.0186350 5 (140,210]
#> 2: 36 118 8.0 72 2 1 2 1.0107583 5 (70,140]
#> 3: 12 149 12.6 74 3 1 3 0.9891023 5 (140,210]
#> 4: 18 313 11.5 62 4 1 4 0.9913450 5 (280,350]
#> 5: NA NA 14.3 56 5 1 5 0.9945367 5 <NA>
#> ---
#> 149: 30 193 6.9 70 26 1 149 0.9985280 9 (140,210]
#> 150: NA 145 13.2 77 27 1 150 1.0001786 9 (140,210]
#> 151: 14 191 14.3 75 28 1 151 1.0024673 9 (140,210]
#> 152: 18 131 8.0 76 29 1 152 0.9968826 9 (70,140]
#> 153: 20 223 11.5 68 30 1 153 1.0056592 9 (210,280]
#> Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> 1: (40,60] (40,60] FALSE 190.0000 (140,210] 41.00000
#> 2: (20,40] (20,40] FALSE 118.0000 (70,140] 36.00000
#> 3: (0,20] (0,20] FALSE 149.0000 (140,210] 12.00000
#> 4: (0,20] (0,20] FALSE 313.0000 (280,350] 18.00000
#> 5: <NA> <NA> NA 107.2179 (0,70] 23.36856
#> ---
#> 149: (20,40] (20,40] FALSE 193.0000 (140,210] 30.00000
#> 150: <NA> <NA> NA 145.0000 (140,210] 96.48078
#> 151: (0,20] (0,20] FALSE 191.0000 (140,210] 14.00000
#> 152: (0,20] (0,20] FALSE 131.0000 (70,140] 18.00000
#> 153: (0,20] (0,20] FALSE 223.0000 (210,280] 20.00000
#> Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> 1: 41.000000 41.000000 41.000000 41.000000 41.000000 41.000000
#> 2: 36.000000 36.000000 36.000000 36.000000 36.000000 36.000000
#> 3: 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000
#> 4: 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000
#> 5: 6.504706 6.097045 4.468612 5.127981 9.407458 9.162393
#> ---
#> 149: 30.000000 30.000000 30.000000 30.000000 30.000000 30.000000
#> 150: 26.104521 27.832564 26.749170 24.046683 11.938056 35.525295
#> 151: 14.000000 14.000000 14.000000 14.000000 14.000000 14.000000
#> 152: 18.000000 18.000000 18.000000 18.000000 18.000000 18.000000
#> 153: 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000
#> Ozone_chac_imp
#> 1: (40,60]
#> 2: (20,40]
#> 3: (0,20]
#> 4: (0,20]
#> 5: <NA>
#> ---
#> 149: (20,40]
#> 150: <NA>
#> 151: (0,20]
#> 152: (0,20]
#> 153: (0,20]
# Sample of results: the first five rows whose first column is missing
air_miss[which(is.na(air_miss[[1L]]))[1:5], ]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> 1: NA NA 14.3 56 5 1 5 0.9945367 5 <NA>
#> 2: NA 194 8.6 69 10 1 10 0.9950548 5 (140,210]
#> 3: NA 66 16.6 57 25 1 25 1.0124095 5 (0,70]
#> 4: NA 266 14.9 58 26 1 26 1.0047912 5 (210,280]
#> 5: NA NA 8.0 57 27 1 27 0.9998296 5 <NA>
#> Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> 1: <NA> <NA> NA 107.2179 (0,70] 23.36856
#> 2: <NA> <NA> NA 194.0000 (140,210] 26.03157
#> 3: <NA> <NA> NA 66.0000 (0,70] 29.68484
#> 4: <NA> <NA> NA 266.0000 (210,280] 71.09846
#> 5: <NA> <NA> NA 136.1804 (0,70] 29.26658
#> Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> 1: 6.504706 6.097045 4.468612 5.127981 9.407458 9.162393
#> 2: 22.936424 20.418704 24.059228 20.388899 30.532943 24.061295
#> 3: 12.489261 5.580260 4.953874 5.311351 8.644043 11.110604
#> 4: 23.215834 9.329280 11.101567 5.663351 14.235716 22.440701
#> 5: 7.668188 9.368873 7.037318 6.391228 13.669493 12.233613
#> Ozone_chac_imp
#> 1: <NA>
#> 2: <NA>
#> 3: <NA>
#> 4: <NA>
#> 5: <NA>