Multiple imputations to fill in missing data. Non-missing independent variables are used to approximate the missing observations of a dependent variable. The quantitative models are built with the Rcpp package and the C++ library Armadillo.
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
# S3 method for class 'data.frame'
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
# S3 method for class 'data.table'
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
# S3 method for class 'matrix'
fill_NA_N(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
k = 10,
ridge = 1e-06
)
a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables
a character - possible options ("lm_bayes","lm_noise","pmm")
an integer/character - a position/name of dependent variable
an integer/character vector - positions/names of independent variables
a numeric vector - a weighting variable - only positive values, Default: NULL
a boolean - if dependent variable has log-normal distribution (numeric). If TRUE log-regression is evaluated and then returned exponential of results., Default: FALSE
an integer - the number of multiple imputations or, for pmm, the number of closest points from which one random value is taken, Default: 10
a numeric - a value added to the diagonal elements of the x'x matrix, Default: 1e-6
a vector of imputations in a numeric/character/factor format (similar to the input type)
fill_NA_N(data.frame)
: S3 method for data.frame
fill_NA_N(data.table)
: S3 method for data.table
fill_NA_N(matrix)
: S3 method for matrix
It is assumed that users add the intercept on their own. The miceFast module provides the most efficient environment; the second recommended option is to use data.table with the numeric matrix data type. The lda model is assessed only if there are more than 15 complete observations, and the lm models only if the number of variables is smaller than the number of observations.
library(miceFast)
library(dplyr)
library(data.table)
### Data
# airquality dataset with additional variables
data(air_miss)
### Intro: dplyr
# IMPUTATIONS
# A single pipeline that stores every imputed variable in a new column
# (*_imp), so the original columns with missing values stay untouched.
air_miss <- air_miss %>%
# Imputations with a grouping option (models are assessed separately for each group),
# taking the provided weights into account
group_by(groups) %>%
do(mutate(., Solar_R_imp = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .[["weights"]]
))) %>%
ungroup() %>%
# Imputations - discrete variable
mutate(x_character_imp = fill_NA(
x = .,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp")
)) %>%
# logreg is used because Ozone has an almost log-normal distribution
# imputations around the mean (the intercept is the only independent variable)
mutate(Ozone_imp1 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)) %>%
# imputations using column positions instead of names:
# posit_y = 1 (Ozone), posit_x = 4 and 6 (Temp, Intercept)
mutate(Ozone_imp2 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)) %>%
# multiple imputations (average of 30 imputations, k = 30)
# with a factor independent variable, weights and logreg options
mutate(Ozone_imp3 = fill_NA_N(
x = .,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# the same setup as Ozone_imp3, but with the "lm_bayes" model
mutate(Ozone_imp4 = fill_NA_N(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# grouped imputations: a single "lm_pred" imputation per group ...
group_by(groups) %>%
do(mutate(., Ozone_imp5 = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE
))) %>%
# ... and "pmm" per group, where k is the number of closest points
# from which one random value is taken
do(mutate(., Ozone_imp6 = fill_NA_N(
x = .,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 20
))) %>%
ungroup() %>%
# Row-wise average of all Ozone_imp* columns created so far
mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
# Protecting against collinearity or a low number of observations - across small groups
# Be careful when using a grouping option,
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can be used instead.
# NOTE(review): "Month" is requested in posit_x, but the printed sample of
# results lists no Month column and Ozone_chac_imp stays NA there - the
# tryCatch fallback may be hiding an error; confirm against the dataset.
group_by(groups) %>%
do(mutate(., Ozone_chac_imp = tryCatch(
fill_NA(
x = .,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .[["weights"]]
),
# on error, fall back to the original (unimputed) Ozone_chac values
error = function(e) .[["Ozone_chac"]]
))) %>%
ungroup()
# Sample of results: the first five rows where the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> # A tibble: 5 × 23
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
#> 1 NA NA 14.3 56 5 1 5 0.995 5 NA
#> 2 NA 194 8.6 69 10 1 10 0.995 5 (140,210]
#> 3 NA 66 16.6 57 25 1 25 1.01 5 (0,70]
#> 4 NA 266 14.9 58 26 1 26 1.00 5 (210,280]
#> 5 NA NA 8 57 27 1 27 1.00 5 NA
#> # ℹ 13 more variables: Ozone_chac <chr>, Ozone_f <fct>, Ozone_high <lgl>,
#> # Solar_R_imp <dbl>, x_character_imp <chr>, Ozone_imp1 <dbl>,
#> # Ozone_imp2 <dbl>, Ozone_imp3 <dbl>, Ozone_imp4 <dbl>, Ozone_imp5 <dbl>,
#> # Ozone_imp6 <dbl>, Ozone_imp_mix <dbl>, Ozone_chac_imp <chr>
### Intro: data.table
# IMPUTATIONS
# Imputations with a grouping option (models are assessed separately for each group),
# taking the provided weights into account
# Reload the dataset and convert it to a data.table before imputing
data(air_miss)
setDT(air_miss)
# multiple imputations (k = 100) with the "lm_bayes" model, per group
air_miss[, Solar_R_imp := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .SD[["weights"]],
k = 100
), by = .(groups)] %>%
# Imputations - discrete variable
.[, x_character_imp := fill_NA(
x = .SD,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp", "groups")
)] %>%
# logreg is used because Ozone has an almost log-normal distribution
# imputations around the mean (the intercept is the only independent variable)
.[, Ozone_imp1 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)] %>%
# imputations using column positions instead of names:
# posit_y = 1 (Ozone), posit_x = 4 and 6 (Temp, Intercept)
.[, Ozone_imp2 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)] %>%
# model with a factor independent variable
# multiple imputations (average of 30 imputations, k = 30)
# with a factor independent variable, weights and logreg options
.[, Ozone_imp3 := fill_NA_N(
x = .SD,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# the same setup as Ozone_imp3, but with the "lm_bayes" model
.[, Ozone_imp4 := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# grouped imputations: .(groups) is passed positionally as the `by` argument
.[, Ozone_imp5 := fill_NA(
x = .SD,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE
), .(groups)] %>%
# "pmm" per group, where k is the number of closest points
# from which one random value is taken
.[, Ozone_imp6 := fill_NA_N(
x = .SD,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 10
), .(groups)] %>%
# Row-wise average of the columns Ozone_imp1 to Ozone_imp6
.[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
# Protecting against collinearity or a low number of observations - across small groups
# Be careful when using a data.table grouping option,
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can be used instead.
# NOTE(review): "Month" is requested in posit_x, but the printed output
# lists no Month column and Ozone_chac_imp stays NA there - the
# tryCatch fallback may be hiding an error; confirm against the dataset.
.[, Ozone_chac_imp := tryCatch(
fill_NA(
x = .SD,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .SD[["weights"]]
),
# on error, fall back to the original (unimputed) Ozone_chac values
error = function(e) .SD[["Ozone_chac"]]
), .(groups)]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups
#> <num> <num> <num> <num> <num> <num> <num> <num> <fctr>
#> 1: 41 190 7.4 67 1 1 1 1.0186350 5
#> 2: 36 118 8.0 72 2 1 2 1.0107583 5
#> 3: 12 149 12.6 74 3 1 3 0.9891023 5
#> 4: 18 313 11.5 62 4 1 4 0.9913450 5
#> 5: NA NA 14.3 56 5 1 5 0.9945367 5
#> ---
#> 149: 30 193 6.9 70 26 1 149 0.9985280 9
#> 150: NA 145 13.2 77 27 1 150 1.0001786 9
#> 151: 14 191 14.3 75 28 1 151 1.0024673 9
#> 152: 18 131 8.0 76 29 1 152 0.9968826 9
#> 153: 20 223 11.5 68 30 1 153 1.0056592 9
#> x_character Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp
#> <char> <char> <fctr> <lgcl> <num> <char>
#> 1: (140,210] (40,60] (40,60] FALSE 190.00000 (140,210]
#> 2: (70,140] (20,40] (20,40] FALSE 118.00000 (70,140]
#> 3: (140,210] (0,20] (0,20] FALSE 149.00000 (140,210]
#> 4: (280,350] (0,20] (0,20] FALSE 313.00000 (280,350]
#> 5: <NA> <NA> <NA> NA 99.71227 (0,70]
#> ---
#> 149: (140,210] (20,40] (20,40] FALSE 193.00000 (140,210]
#> 150: (140,210] <NA> <NA> NA 145.00000 (140,210]
#> 151: (140,210] (0,20] (0,20] FALSE 191.00000 (140,210]
#> 152: (70,140] (0,20] (0,20] FALSE 131.00000 (70,140]
#> 153: (210,280] (0,20] (0,20] FALSE 223.00000 (210,280]
#> Ozone_imp1 Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6
#> <num> <num> <num> <num> <num> <num>
#> 1: 41.00000 41.000000 41.000000 41.000000 41.000000 41.000000
#> 2: 36.00000 36.000000 36.000000 36.000000 36.000000 36.000000
#> 3: 12.00000 12.000000 12.000000 12.000000 12.000000 12.000000
#> 4: 18.00000 18.000000 18.000000 18.000000 18.000000 18.000000
#> 5: 218.27389 4.395301 5.946297 4.993256 5.127981 9.407458
#> ---
#> 149: 30.00000 30.000000 30.000000 30.000000 30.000000 30.000000
#> 150: 78.66808 11.338809 25.557737 26.870680 24.046683 17.280067
#> 151: 14.00000 14.000000 14.000000 14.000000 14.000000 14.000000
#> 152: 18.00000 18.000000 18.000000 18.000000 18.000000 18.000000
#> 153: 20.00000 20.000000 20.000000 20.000000 20.000000 20.000000
#> Ozone_imp_mix Ozone_chac_imp
#> <num> <char>
#> 1: 41.00000 (40,60]
#> 2: 36.00000 (20,40]
#> 3: 12.00000 (0,20]
#> 4: 18.00000 (0,20]
#> 5: 41.35736 <NA>
#> ---
#> 149: 30.00000 (20,40]
#> 150: 30.62701 <NA>
#> 151: 14.00000 (0,20]
#> 152: 18.00000 (0,20]
#> 153: 20.00000 (0,20]
# Sample of results: the first five rows where the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> <num> <num> <num> <num> <num> <num> <num> <num> <fctr> <char>
#> 1: NA NA 14.3 56 5 1 5 0.9945367 5 <NA>
#> 2: NA 194 8.6 69 10 1 10 0.9950548 5 (140,210]
#> 3: NA 66 16.6 57 25 1 25 1.0124095 5 (0,70]
#> 4: NA 266 14.9 58 26 1 26 1.0047912 5 (210,280]
#> 5: NA NA 8.0 57 27 1 27 0.9998296 5 <NA>
#> Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> <char> <fctr> <lgcl> <num> <char> <num>
#> 1: <NA> <NA> NA 99.71227 (0,70] 218.27389
#> 2: <NA> <NA> NA 194.00000 (140,210] 14.29654
#> 3: <NA> <NA> NA 66.00000 (0,70] 79.75728
#> 4: <NA> <NA> NA 266.00000 (210,280] 33.15668
#> 5: <NA> <NA> NA 103.74996 (0,70] 109.40499
#> Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> <num> <num> <num> <num> <num> <num>
#> 1: 4.395301 5.946297 4.993256 5.127981 9.407458 41.35736
#> 2: 18.865449 25.249996 24.818516 20.388899 21.308840 20.82137
#> 3: 29.118009 5.250535 5.526799 5.311351 15.189318 23.35888
#> 4: 6.547952 10.537040 10.321269 5.663351 14.235716 13.41033
#> 5: 10.648794 8.478407 7.720920 6.391228 15.189318 26.30561
#> Ozone_chac_imp
#> <char>
#> 1: <NA>
#> 2: <NA>
#> 3: <NA>
#> 4: <NA>
#> 5: <NA>