Regular imputations to fill the missing data. Non-missing independent variables are used to approximate the missing observations of a dependent variable. Quantitative models were built using the Rcpp packages and the C++ library Armadillo.
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
# S3 method for class 'data.frame'
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
# S3 method for class 'data.table'
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
# S3 method for class 'matrix'
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)
a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables
a character - possible options ("lda","lm_pred","lm_bayes","lm_noise")
an integer/character - a position/name of dependent variable
an integer/character vector - positions/names of independent variables
a numeric vector - a weighting variable - only positive values, Default:NULL
a boolean - whether the (numeric) dependent variable has a log-normal distribution. If TRUE, a log-regression is evaluated and the exponential of the results is returned. Default: FALSE
a numeric - a value added to the diagonal elements of the x'x matrix, Default: 1e-6
a vector of imputations in a numeric/logical/character/factor format (similar to the input type)
fill_NA(data.frame)
: S3 method for data.frame
fill_NA(data.table)
: S3 method for data.table
fill_NA(matrix)
: S3 method for matrix
It is assumed that users add the intercept on their own. The miceFast module provides the most efficient environment; the second recommended option is to use data.table and the numeric matrix data type. The lda model is assessed only if there are more than 15 complete observations, and the lm models only if the number of independent variables is smaller than the number of observations.
library(miceFast)
library(dplyr)
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
library(data.table)
#>
#> Attaching package: ‘data.table’
#> The following objects are masked from ‘package:dplyr’:
#>
#> between, first, last
### Data
# airquality dataset with additional variables
data(air_miss)
### Intro: dplyr
# IMPUTATIONS
# Every step below adds one new *_imp column; the source columns are not overwritten.
# Inside the pipe, `.` is the data frame passed on by the previous step
# (inside do() it is the rows of the current group).
air_miss <- air_miss %>%
# Imputations with a grouping option (models are separately assessed for each group)
# taking into account provided weights
group_by(groups) %>%
do(mutate(., Solar_R_imp = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .[["weights"]]
))) %>%
ungroup() %>%
# Imputations - discrete variable (lda model, whole data set, no weights)
mutate(x_character_imp = fill_NA(
x = .,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp")
)) %>%
# logreg = TRUE is used because Ozone is almost log-normally distributed
# imputations around the mean (the only independent variable is the Intercept)
mutate(Ozone_imp1 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)) %>%
# imputations using column positions instead of names:
# 1 = Ozone, 4 = Temp, 6 = Intercept (per the column order printed in the sample of results)
mutate(Ozone_imp2 = fill_NA(
x = .,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)) %>%
# multiple imputations (average of k = 30 imputations)
# with a factor independent variable, weights and logreg options
mutate(Ozone_imp3 = fill_NA_N(
x = .,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# same call as for Ozone_imp3, but with the "lm_bayes" model
mutate(Ozone_imp4 = fill_NA_N(
x = .,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 30
)) %>%
# the next two imputations are evaluated separately for each group
group_by(groups) %>%
do(mutate(., Ozone_imp5 = fill_NA(
x = .,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE
))) %>%
# "pmm" model through fill_NA_N with k = 20
do(mutate(., Ozone_imp6 = fill_NA_N(
x = .,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .[["weights"]],
logreg = TRUE,
k = 20
))) %>%
ungroup() %>%
# Row-wise average of all columns whose name starts with "Ozone_imp"
mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
# Protecting against collinearity or a low number of observations - across small groups
# Be careful when using a grouping option
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can be used as a safeguard.
# NOTE(review): no "Month" column appears in the air_miss column list printed in the
# sample of results - confirm it exists; if it does not, fill_NA() would fail in every
# group and the error handler would return the un-imputed Ozone_chac.
group_by(groups) %>%
do(mutate(., Ozone_chac_imp = tryCatch(
fill_NA(
x = .,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .[["weights"]]
),
# fallback: on any error keep the original (un-imputed) values of this group
error = function(e) .[["Ozone_chac"]]
))) %>%
ungroup()
# Sample of results: the first five rows in which the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> # A tibble: 5 × 23
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
#> 1 NA NA 14.3 56 5 1 5 0.995 5 NA
#> 2 NA 194 8.6 69 10 1 10 0.995 5 (140,210]
#> 3 NA 66 16.6 57 25 1 25 1.01 5 (0,70]
#> 4 NA 266 14.9 58 26 1 26 1.00 5 (210,280]
#> 5 NA NA 8 57 27 1 27 1.00 5 NA
#> # ℹ 13 more variables: Ozone_chac <chr>, Ozone_f <fct>, Ozone_high <lgl>,
#> # Solar_R_imp <dbl>, x_character_imp <chr>, Ozone_imp1 <dbl>,
#> # Ozone_imp2 <dbl>, Ozone_imp3 <dbl>, Ozone_imp4 <dbl>, Ozone_imp5 <dbl>,
#> # Ozone_imp6 <dbl>, Ozone_imp_mix <dbl>, Ozone_chac_imp <chr>
### Intro: data.table
# IMPUTATIONS
# Imputations with a grouping option (models are separately assessed for each group)
# taking into account provided weights
# The data set is reloaded so this example starts without the columns added earlier.
data(air_miss)
setDT(air_miss)
# Each step adds one new column with `:=`; the result is piped into `.[...]`
# so that the next step works on the table that already holds the earlier columns.
# .SD is the subset of data for the current group (the whole table when no `by` is given).
air_miss[, Solar_R_imp := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Solar.R",
posit_x = c("Wind", "Temp", "Intercept"),
w = .SD[["weights"]],
k = 100
), by = .(groups)] %>%
# Imputations - discrete variable
.[, x_character_imp := fill_NA(
x = .SD,
model = "lda",
posit_y = "x_character",
posit_x = c("Wind", "Temp", "groups")
)] %>%
# logreg = TRUE is used because Ozone is almost log-normally distributed
# imputations around the mean (the only independent variable is the Intercept)
.[, Ozone_imp1 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept"),
logreg = TRUE
)] %>%
# imputations using column positions instead of names:
# 1 = Ozone, 4 = Temp, 6 = Intercept (per the column order of the printed table)
.[, Ozone_imp2 := fill_NA(
x = .SD,
model = "lm_bayes",
posit_y = 1,
posit_x = c(4, 6),
logreg = TRUE
)] %>%
# model with a factor independent variable
# multiple imputations (average of k = 30 imputations)
# with a factor independent variable, weights and logreg options
.[, Ozone_imp3 := fill_NA_N(
x = .SD,
model = "lm_noise",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# same call as for Ozone_imp3, but with the "lm_bayes" model
.[, Ozone_imp4 := fill_NA_N(
x = .SD,
model = "lm_bayes",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 30
)] %>%
# the next two imputations are evaluated separately for each value of `groups`
.[, Ozone_imp5 := fill_NA(
x = .SD,
model = "lm_pred",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE
), .(groups)] %>%
# "pmm" model through fill_NA_N with k = 10
.[, Ozone_imp6 := fill_NA_N(
x = .SD,
model = "pmm",
posit_y = "Ozone",
posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
w = .SD[["weights"]],
logreg = TRUE,
k = 10
), .(groups)] %>%
# Row-wise average of the columns Ozone_imp1 ... Ozone_imp6
.[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
# Protecting against collinearity or a low number of observations - across small groups
# Be careful when using a data.table grouping option
# because there is no protection against collinearity or a low number of observations.
# A tryCatch(fill_NA(...), error = function(e) return(...)) can be used as a safeguard.
# NOTE(review): no "Month" column appears in the printed air_miss table, and the
# printed Ozone_chac_imp values are still <NA> where Ozone_chac is <NA>, which is
# consistent with the error handler returning the un-imputed column - confirm whether
# "Month" is intended here.
.[, Ozone_chac_imp := tryCatch(
fill_NA(
x = .SD,
model = "lda",
posit_y = "Ozone_chac",
posit_x = c(
"Intercept",
"Month",
"Day",
"Temp",
"x_character_imp"
),
w = .SD[["weights"]]
),
# fallback: on any error keep the original (un-imputed) values of this group
error = function(e) .SD[["Ozone_chac"]]
), .(groups)]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups
#> <num> <num> <num> <num> <num> <num> <num> <num> <fctr>
#> 1: 41 190 7.4 67 1 1 1 1.0186350 5
#> 2: 36 118 8.0 72 2 1 2 1.0107583 5
#> 3: 12 149 12.6 74 3 1 3 0.9891023 5
#> 4: 18 313 11.5 62 4 1 4 0.9913450 5
#> 5: NA NA 14.3 56 5 1 5 0.9945367 5
#> ---
#> 149: 30 193 6.9 70 26 1 149 0.9985280 9
#> 150: NA 145 13.2 77 27 1 150 1.0001786 9
#> 151: 14 191 14.3 75 28 1 151 1.0024673 9
#> 152: 18 131 8.0 76 29 1 152 0.9968826 9
#> 153: 20 223 11.5 68 30 1 153 1.0056592 9
#> x_character Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp
#> <char> <char> <fctr> <lgcl> <num> <char>
#> 1: (140,210] (40,60] (40,60] FALSE 190.0000 (140,210]
#> 2: (70,140] (20,40] (20,40] FALSE 118.0000 (70,140]
#> 3: (140,210] (0,20] (0,20] FALSE 149.0000 (140,210]
#> 4: (280,350] (0,20] (0,20] FALSE 313.0000 (280,350]
#> 5: <NA> <NA> <NA> NA 109.0075 (0,70]
#> ---
#> 149: (140,210] (20,40] (20,40] FALSE 193.0000 (140,210]
#> 150: (140,210] <NA> <NA> NA 145.0000 (140,210]
#> 151: (140,210] (0,20] (0,20] FALSE 191.0000 (140,210]
#> 152: (70,140] (0,20] (0,20] FALSE 131.0000 (70,140]
#> 153: (210,280] (0,20] (0,20] FALSE 223.0000 (210,280]
#> Ozone_imp1 Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6
#> <num> <num> <num> <num> <num> <num>
#> 1: 41.00000 41.000000 41.000000 41.000000 41.000000 41.00000
#> 2: 36.00000 36.000000 36.000000 36.000000 36.000000 36.00000
#> 3: 12.00000 12.000000 12.000000 12.000000 12.000000 12.00000
#> 4: 18.00000 18.000000 18.000000 18.000000 18.000000 18.00000
#> 5: 37.27863 6.707116 5.428128 5.534204 5.127981 13.66949
#> ---
#> 149: 30.00000 30.000000 30.000000 30.000000 30.000000 30.00000
#> 150: 30.33189 61.381910 23.171726 28.634885 24.046683 25.46112
#> 151: 14.00000 14.000000 14.000000 14.000000 14.000000 14.00000
#> 152: 18.00000 18.000000 18.000000 18.000000 18.000000 18.00000
#> 153: 20.00000 20.000000 20.000000 20.000000 20.000000 20.00000
#> Ozone_imp_mix Ozone_chac_imp
#> <num> <char>
#> 1: 41.00000 (40,60]
#> 2: 36.00000 (20,40]
#> 3: 12.00000 (0,20]
#> 4: 18.00000 (0,20]
#> 5: 12.29093 <NA>
#> ---
#> 149: 30.00000 (20,40]
#> 150: 32.17137 <NA>
#> 151: 14.00000 (0,20]
#> 152: 18.00000 (0,20]
#> 153: 20.00000 (0,20]
# Sample of results: the first five rows in which the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> Ozone Solar.R Wind Temp Day Intercept index weights groups x_character
#> <num> <num> <num> <num> <num> <num> <num> <num> <fctr> <char>
#> 1: NA NA 14.3 56 5 1 5 0.9945367 5 <NA>
#> 2: NA 194 8.6 69 10 1 10 0.9950548 5 (140,210]
#> 3: NA 66 16.6 57 25 1 25 1.0124095 5 (0,70]
#> 4: NA 266 14.9 58 26 1 26 1.0047912 5 (210,280]
#> 5: NA NA 8.0 57 27 1 27 0.9998296 5 <NA>
#> Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> <char> <fctr> <lgcl> <num> <char> <num>
#> 1: <NA> <NA> NA 109.0075 (0,70] 37.27863
#> 2: <NA> <NA> NA 194.0000 (140,210] 59.40327
#> 3: <NA> <NA> NA 66.0000 (0,70] 39.00817
#> 4: <NA> <NA> NA 266.0000 (210,280] 156.31780
#> 5: <NA> <NA> NA 112.2568 (0,70] 19.56298
#> Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> <num> <num> <num> <num> <num> <num>
#> 1: 6.707116 5.428128 5.534204 5.127981 13.66949 12.29093
#> 2: 10.930479 27.271427 27.102997 20.388899 14.38842 26.58092
#> 3: 4.694893 5.513273 4.548383 5.311351 12.82412 11.98336
#> 4: 18.047066 11.236547 8.884315 5.663351 15.18932 35.88973
#> 5: 16.322812 9.198472 8.382237 6.391228 13.66949 12.25454
#> Ozone_chac_imp
#> <char>
#> 1: <NA>
#> 2: <NA>
#> 3: <NA>
#> 4: <NA>
#> 5: <NA>