Multiple imputations to fill the missing data. Non-missing independent variables are used to approximate the missing observations of a dependent variable. The quantitative models are built with the Rcpp package and the C++ library Armadillo.
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
# S3 method for data.frame
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
# S3 method for data.table
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
# S3 method for matrix
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables
a character - possible options ("lm_bayes","lm_noise","pmm")
an integer/character - a position/name of dependent variable
an integer/character vector - positions/names of independent variables
a numeric vector - a weighting variable - only positive values, Default: NULL
a boolean - whether the (numeric) dependent variable has a log-normal distribution. If TRUE, a log-regression is evaluated and the exponential of the results is returned, Default: FALSE
an integer - the number of multiple imputations or, for pmm, the number of closest points from which one random value is taken, Default: 10
a numeric - a value added to diagonal elements of the x'x matrix, Default: 1e-6
load imputations in a numeric/character/factor (similar to the input type) vector format
fill_NA_N(data.frame)
: S3 method for data.frame
fill_NA_N(data.table)
: S3 method for data.table
fill_NA_N(matrix)
: S3 method for matrix
It is assumed that users add the intercept themselves. The miceFast module provides the most efficient environment; the second recommended option is to use data.table and the numeric matrix data type. The lda model is assessed only if there are more than 15 complete observations, and the lm models only if the number of variables is smaller than the number of observations.
library(miceFast)
library(dplyr)
library(data.table)
### Data
# air_miss: the airquality dataset with additional variables
data(air_miss)
### Intro: dplyr
# IMPUTATIONS
# The whole pipeline below is a single chain; each step adds one imputed column
# and the result is assigned back to air_miss.
air_miss <- air_miss %>%
# Imputations with a grouping option (models are assessed separately for each group),
# taking the provided weights into account
group_by(groups) %>%
do(mutate(., Solar_R_imp = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .[["weights"]]
))) %>%
ungroup() %>%
# Imputations - discrete variable
mutate(x_character_imp = fill_NA(
x = .,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp")
)) %>%
# logreg = TRUE is used because Ozone is almost log-normally distributed;
# intercept-only model, i.e. imputations around the mean
mutate(Ozone_imp1 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)) %>%
# imputations using column positions instead of names
# (per the printed column order: 1 = Ozone, 4 = Temp, 6 = Intercept)
mutate(Ozone_imp2 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)) %>%
# multiple imputations (average of 30 imputations)
# with a discrete independent variable (x_character_imp), weights and the logreg option
mutate(Ozone_imp3 = fill_NA_N(
x = .,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# same setup as Ozone_imp3 but with the "lm_bayes" model
mutate(Ozone_imp4 = fill_NA_N(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# the next two imputations are evaluated separately within each group
group_by(groups) %>%
do(mutate(., Ozone_imp5 = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE
))) %>%
# "pmm" model; here k = 20 is passed to fill_NA_N
do(mutate(., Ozone_imp6 = fill_NA_N(
x = .,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 20
))) %>%
ungroup() %>%
# Row-wise average over all columns whose name starts with "Ozone_imp"
mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
# Protecting against collinearity or a low number of observations - across small groups
# Be careful when using a grouping option
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can be used as a fallback.
# NOTE(review): "Month" is not among the columns shown in the printed result;
# if the column is absent, fill_NA presumably errors and the fallback below
# silently returns the unimputed column - confirm against the dataset.
group_by(groups) %>%
do(mutate(., Ozone_chac_imp = tryCatch(
fill_NA(
x = .,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .[["weights"]]
),
# on any error, fall back to the original Ozone_chac column of the group
error = function(e) .[["Ozone_chac"]]
))) %>%
ungroup()
# Sample of results: the first five rows where column 1 (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> # A tibble: 5 × 23
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_cha…¹ Ozone…²
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr> <chr>
#> 1 NA NA 14.3 56 5 1 5 0.995 5 NA NA
#> 2 NA 194 8.6 69 10 1 10 0.995 5 (140,2… NA
#> 3 NA 66 16.6 57 25 1 25 1.01 5 (0,70] NA
#> 4 NA 266 14.9 58 26 1 26 1.00 5 (210,2… NA
#> 5 NA NA 8 57 27 1 27 1.00 5 NA NA
#> # … with 12 more variables: Ozone_f <fct>, Ozone_high <lgl>, Solar_R_imp <dbl>,
#> # x_character_imp <chr>, Ozone_imp1 <dbl>, Ozone_imp2 <dbl>,
#> # Ozone_imp3 <dbl>, Ozone_imp4 <dbl>, Ozone_imp5 <dbl>, Ozone_imp6 <dbl>,
#> # Ozone_imp_mix <dbl>, Ozone_chac_imp <chr>, and abbreviated variable names
#> # ¹x_character, ²Ozone_chac
### Intro: data.table
# IMPUTATIONS
# Imputations with a grouping option (models are assessed separately for each group),
# taking the provided weights into account
# Reload the dataset and convert it to a data.table by reference
data(air_miss)
setDT(air_miss)
# Each step below adds one imputed column with := ; the steps are chained with %>%
air_miss[, Solar_R_imp := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .SD[["weights"]],
k = 100
), by = .(groups)] %>%
# Imputations - discrete variable
.[, x_character_imp := fill_NA(
x = .SD,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp", "groups")
)] %>%
# logreg = TRUE is used because Ozone is almost log-normally distributed;
# intercept-only model, i.e. imputations around the mean
.[, Ozone_imp1 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)] %>%
# imputations using column positions instead of names
# (per the printed column order: 1 = Ozone, 4 = Temp, 6 = Intercept)
.[, Ozone_imp2 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)] %>%
# multiple imputations (average of 30 imputations)
# with a discrete independent variable (x_character_imp), weights and the logreg option
.[, Ozone_imp3 := fill_NA_N(
x = .SD,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# same setup as Ozone_imp3 but with the "lm_bayes" model
.[, Ozone_imp4 := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# the next two imputations are evaluated separately within each group
.[, Ozone_imp5 := fill_NA(
x = .SD,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE
), .(groups)] %>%
# "pmm" model; here k = 10 is passed to fill_NA_N
.[, Ozone_imp6 := fill_NA_N(
x = .SD,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 10
), .(groups)] %>%
# Row-wise average over the columns Ozone_imp1 to Ozone_imp6
.[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
# Protecting against collinearity or a low number of observations - across small groups
# Be careful when using a data.table grouping option
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can be used as a fallback.
# NOTE(review): "Month" is not among the columns shown in the printed result;
# if the column is absent, fill_NA presumably errors and the fallback below
# silently returns the unimputed column - confirm against the dataset.
.[, Ozone_chac_imp := tryCatch(
fill_NA(
x = .SD,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .SD[["weights"]]
),
# on any error, fall back to the original Ozone_chac column of the group
error = function(e) .SD[["Ozone_chac"]]
), .(groups)]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> 1: 41 190 7.4 67 1 1 1 1.0186350 5 (140,210]
#> 2: 36 118 8.0 72 2 1 2 1.0107583 5 (70,140]
#> 3: 12 149 12.6 74 3 1 3 0.9891023 5 (140,210]
#> 4: 18 313 11.5 62 4 1 4 0.9913450 5 (280,350]
#> 5: NA NA 14.3 56 5 1 5 0.9945367 5 <NA>
#> ---
#> 149: 30 193 6.9 70 26 1 149 0.9985280 9 (140,210]
#> 150: NA 145 13.2 77 27 1 150 1.0001786 9 (140,210]
#> 151: 14 191 14.3 75 28 1 151 1.0024673 9 (140,210]
#> 152: 18 131 8.0 76 29 1 152 0.9968826 9 (70,140]
#> 153: 20 223 11.5 68 30 1 153 1.0056592 9 (210,280]
#> Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> 1: (40,60] (40,60] FALSE 190.00000 (140,210] 41.00000
#> 2: (20,40] (20,40] FALSE 118.00000 (70,140] 36.00000
#> 3: (0,20] (0,20] FALSE 149.00000 (140,210] 12.00000
#> 4: (0,20] (0,20] FALSE 313.00000 (280,350] 18.00000
#> 5: <NA> <NA> NA 93.38071 (0,70] 39.86381
#> ---
#> 149: (20,40] (20,40] FALSE 193.00000 (140,210] 30.00000
#> 150: <NA> <NA> NA 145.00000 (140,210] 31.51104
#> 151: (0,20] (0,20] FALSE 191.00000 (140,210] 14.00000
#> 152: (0,20] (0,20] FALSE 131.00000 (70,140] 18.00000
#> 153: (0,20] (0,20] FALSE 223.00000 (210,280] 20.00000
#> Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> 1: 41.00000 41.000000 41.00000 41.000000 41.00000 41.00000
#> 2: 36.00000 36.000000 36.00000 36.000000 36.00000 36.00000
#> 3: 12.00000 12.000000 12.00000 12.000000 12.00000 12.00000
#> 4: 18.00000 18.000000 18.00000 18.000000 18.00000 18.00000
#> 5: 21.37254 5.262766 5.72173 5.127981 13.66949 15.16972
#> ---
#> 149: 30.00000 30.000000 30.00000 30.000000 30.00000 30.00000
#> 150: 12.19346 29.142736 20.93322 24.046683 23.53701 23.56069
#> 151: 14.00000 14.000000 14.00000 14.000000 14.00000 14.00000
#> 152: 18.00000 18.000000 18.00000 18.000000 18.00000 18.00000
#> 153: 20.00000 20.000000 20.00000 20.000000 20.00000 20.00000
#> Ozone_chac_imp
#> 1: (40,60]
#> 2: (20,40]
#> 3: (0,20]
#> 4: (0,20]
#> 5: <NA>
#> ---
#> 149: (20,40]
#> 150: <NA>
#> 151: (0,20]
#> 152: (0,20]
#> 153: (0,20]
# Sample of results: the first five rows where column 1 (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> 1: NA NA 14.3 56 5 1 5 0.9945367 5 <NA>
#> 2: NA 194 8.6 69 10 1 10 0.9950548 5 (140,210]
#> 3: NA 66 16.6 57 25 1 25 1.0124095 5 (0,70]
#> 4: NA 266 14.9 58 26 1 26 1.0047912 5 (210,280]
#> 5: NA NA 8.0 57 27 1 27 0.9998296 5 <NA>
#> Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> 1: <NA> <NA> NA 93.38071 (0,70] 39.86381
#> 2: <NA> <NA> NA 194.00000 (140,210] 50.32905
#> 3: <NA> <NA> NA 66.00000 (0,70] 70.39088
#> 4: <NA> <NA> NA 266.00000 (210,280] 80.25469
#> 5: <NA> <NA> NA 128.47330 (0,70] 77.42150
#> Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> 1: 21.372536 5.262766 5.721730 5.127981 13.66949 15.16972
#> 2: 42.128453 23.343879 22.115402 20.388899 26.05740 30.72718
#> 3: 20.857649 4.483203 5.502857 5.311351 14.23572 20.13028
#> 4: 7.254263 10.940727 9.095296 5.663351 14.23572 21.24067
#> 5: 4.876854 7.116634 6.480032 6.391228 17.52725 19.96892
#> Ozone_chac_imp
#> 1: <NA>
#> 2: <NA>
#> 3: <NA>
#> 4: <NA>
#> 5: <NA>