Regular imputations to fill in missing data. Non-missing independent variables are used to approximate the missing observations of a dependent variable. The quantitative models are built with the Rcpp package and the C++ library Armadillo.

fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

# S3 method for data.frame
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

# S3 method for data.table
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

# S3 method for matrix
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

Arguments

x

a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables

model

a character - possible options ("lda","lm_pred","lm_bayes","lm_noise")

posit_y

an integer/character - a position/name of dependent variable

posit_x

an integer/character vector - positions/names of independent variables

w

a numeric vector - a weighting variable - only positive values, Default:NULL

logreg

a boolean - whether the dependent variable has a log-normal distribution (numeric). If TRUE, a log-regression is evaluated and the exponential of the results is returned. Default: FALSE

ridge

a numeric - a value added to the diagonal elements of the x'x matrix, Default: 1e-6

Value

the imputations as a numeric/logical/character/factor vector (matching the type of the input variable)

Methods (by class)

  • fill_NA(data.frame): S3 method for data.frame

  • fill_NA(data.table): S3 method for data.table

  • fill_NA(matrix): S3 method for matrix

Note

It is assumed that users add the intercept on their own. The miceFast module provides the most efficient environment; the second recommended option is to use data.table with the numeric matrix data type. The lda model is assessed only if there are more than 15 complete observations, and the lm models only if the number of independent variables is smaller than the number of observations.

See also

Examples

library(miceFast)
library(dplyr)
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
library(data.table)
#> 
#> Attaching package: ‘data.table’
#> The following objects are masked from ‘package:dplyr’:
#> 
#>     between, first, last
### Data
# airquality dataset with additional variables
data(air_miss)
### Intro: dplyr
# IMPUTATIONS
# Each step below adds a new "*_imp" column; later steps reuse columns
# created by earlier ones (e.g. x_character_imp), so the order matters.
air_miss <- air_miss %>%
  # Imputations with a grouping option (a separate model is assessed for
  # each group), taking the provided weights into account
  group_by(groups) %>%
  do(mutate(., Solar_R_imp = fill_NA(
    x = .,
    model = "lm_pred",
    posit_y = "Solar.R",
    posit_x = c("Wind", "Temp", "Intercept"),
    w = .[["weights"]]
  ))) %>%
  ungroup() %>%
  # Imputations - discrete variable
  mutate(x_character_imp = fill_NA(
    x = .,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp")
  )) %>%
  # logreg is used because Ozone is almost log-normally distributed;
  # imputations around the mean (the only predictor is the intercept)
  mutate(Ozone_imp1 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )) %>%
  # Imputations using column positions instead of names:
  # 1 is Ozone, 4 is Temp and 6 is Intercept
  mutate(Ozone_imp2 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  )) %>%
  # Multiple imputations (average of 30 imputations)
  # with a categorical independent variable, weights and the logreg option
  mutate(Ozone_imp3 = fill_NA_N(
    x = .,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  mutate(Ozone_imp4 = fill_NA_N(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  # The next two imputations are again assessed separately for each group
  group_by(groups) %>%
  do(mutate(., Ozone_imp5 = fill_NA(
    x = .,
    model = "lm_pred",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE
  ))) %>%
  do(mutate(., Ozone_imp6 = fill_NA_N(
    x = .,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 20
  ))) %>%
  ungroup() %>%
  # Row-wise average of all columns whose names start with "Ozone_imp"
  mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
  # Protecting against collinearity or a low number of observations
  # across small groups.
  # Be careful when using a grouping option, because there is no protection
  # against collinearity or a low number of observations within a group.
  # A tryCatch(fill_NA(...), error = function(e) return(...)) can be used;
  # here the fallback returns the original, unimputed column.
  group_by(groups) %>%
  do(mutate(., Ozone_chac_imp = tryCatch(
    fill_NA(
      x = .,
      model = "lda",
      posit_y = "Ozone_chac",
      posit_x = c(
        "Intercept",
        "Month",
        "Day",
        "Temp",
        "x_character_imp"
      ),
      w = .[["weights"]]
    ),
    error = function(e) .[["Ozone_chac"]]
  ))) %>%
  ungroup()

# Sample of results: the first five rows where the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> # A tibble: 5 × 23
#>   Ozone Solar.R  Wind  Temp   Day Intercept index weights groups x_cha…¹ Ozone…²
#>   <dbl>   <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl>   <dbl> <fct>  <chr>   <chr>  
#> 1    NA      NA  14.3    56     5         1     5   0.995 5      NA      NA     
#> 2    NA     194   8.6    69    10         1    10   0.995 5      (140,2… NA     
#> 3    NA      66  16.6    57    25         1    25   1.01  5      (0,70]  NA     
#> 4    NA     266  14.9    58    26         1    26   1.00  5      (210,2… NA     
#> 5    NA      NA   8      57    27         1    27   1.00  5      NA      NA     
#> # … with 12 more variables: Ozone_f <fct>, Ozone_high <lgl>, Solar_R_imp <dbl>,
#> #   x_character_imp <chr>, Ozone_imp1 <dbl>, Ozone_imp2 <dbl>,
#> #   Ozone_imp3 <dbl>, Ozone_imp4 <dbl>, Ozone_imp5 <dbl>, Ozone_imp6 <dbl>,
#> #   Ozone_imp_mix <dbl>, Ozone_chac_imp <chr>, and abbreviated variable names
#> #   ¹​x_character, ²​Ozone_chac

### Intro: data.table
# IMPUTATIONS
# Reload the data and convert it to a data.table before imputing.
# Each step below adds a new "*_imp" column with `:=`; later steps reuse
# columns created by earlier ones (e.g. x_character_imp), so the order matters.
# Imputations with a grouping option (a separate model is assessed for
# each group), taking the provided weights into account
data(air_miss)
setDT(air_miss)
air_miss[, Solar_R_imp := fill_NA_N(
  x = .SD,
  model = "lm_bayes",
  posit_y = "Solar.R",
  posit_x = c("Wind", "Temp", "Intercept"),
  w = .SD[["weights"]],
  k = 100
), by = .(groups)] %>%
  # Imputations - discrete variable
  .[, x_character_imp := fill_NA(
    x = .SD,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp", "groups")
  )] %>%
  # logreg is used because Ozone is almost log-normally distributed;
  # imputations around the mean (the only predictor is the intercept)
  .[, Ozone_imp1 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )] %>%
  # Imputations using column positions instead of names:
  # 1 is Ozone, 4 is Temp and 6 is Intercept
  .[, Ozone_imp2 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  )] %>%
  # Multiple imputations (average of 30 imputations)
  # with a categorical independent variable, weights and the logreg option
  .[, Ozone_imp3 := fill_NA_N(
    x = .SD,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  .[, Ozone_imp4 := fill_NA_N(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  # The next two imputations are again assessed separately for each group
  .[, Ozone_imp5 := fill_NA(
    x = .SD,
    model = "lm_pred",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE
  ), .(groups)] %>%
  .[, Ozone_imp6 := fill_NA_N(
    x = .SD,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 10
  ), .(groups)] %>%
  # Row-wise average of the columns Ozone_imp1 through Ozone_imp6
  .[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
  # Protecting against collinearity or a low number of observations
  # across small groups.
  # Be careful when using a data.table grouping option, because there is no
  # protection against collinearity or a low number of observations within a group.
  # A tryCatch(fill_NA(...), error = function(e) return(...)) can be used;
  # here the fallback returns the original, unimputed column.

  .[, Ozone_chac_imp := tryCatch(
    fill_NA(
      x = .SD,
      model = "lda",
      posit_y = "Ozone_chac",
      posit_x = c(
        "Intercept",
        "Month",
        "Day",
        "Temp",
        "x_character_imp"
      ),
      w = .SD[["weights"]]
    ),
    error = function(e) .SD[["Ozone_chac"]]
  ), .(groups)]
#>      Ozone Solar.R Wind Temp Day Intercept index   weights groups x_character
#>   1:    41     190  7.4   67   1         1     1 1.0186350      5   (140,210]
#>   2:    36     118  8.0   72   2         1     2 1.0107583      5    (70,140]
#>   3:    12     149 12.6   74   3         1     3 0.9891023      5   (140,210]
#>   4:    18     313 11.5   62   4         1     4 0.9913450      5   (280,350]
#>   5:    NA      NA 14.3   56   5         1     5 0.9945367      5        <NA>
#>  ---                                                                         
#> 149:    30     193  6.9   70  26         1   149 0.9985280      9   (140,210]
#> 150:    NA     145 13.2   77  27         1   150 1.0001786      9   (140,210]
#> 151:    14     191 14.3   75  28         1   151 1.0024673      9   (140,210]
#> 152:    18     131  8.0   76  29         1   152 0.9968826      9    (70,140]
#> 153:    20     223 11.5   68  30         1   153 1.0056592      9   (210,280]
#>      Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#>   1:    (40,60] (40,60]      FALSE    190.0000       (140,210]   41.00000
#>   2:    (20,40] (20,40]      FALSE    118.0000        (70,140]   36.00000
#>   3:     (0,20]  (0,20]      FALSE    149.0000       (140,210]   12.00000
#>   4:     (0,20]  (0,20]      FALSE    313.0000       (280,350]   18.00000
#>   5:       <NA>    <NA>         NA    107.2179          (0,70]   23.36856
#>  ---                                                                     
#> 149:    (20,40] (20,40]      FALSE    193.0000       (140,210]   30.00000
#> 150:       <NA>    <NA>         NA    145.0000       (140,210]   96.48078
#> 151:     (0,20]  (0,20]      FALSE    191.0000       (140,210]   14.00000
#> 152:     (0,20]  (0,20]      FALSE    131.0000        (70,140]   18.00000
#> 153:     (0,20]  (0,20]      FALSE    223.0000       (210,280]   20.00000
#>      Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#>   1:  41.000000  41.000000  41.000000  41.000000  41.000000     41.000000
#>   2:  36.000000  36.000000  36.000000  36.000000  36.000000     36.000000
#>   3:  12.000000  12.000000  12.000000  12.000000  12.000000     12.000000
#>   4:  18.000000  18.000000  18.000000  18.000000  18.000000     18.000000
#>   5:   6.504706   6.097045   4.468612   5.127981   9.407458      9.162393
#>  ---                                                                     
#> 149:  30.000000  30.000000  30.000000  30.000000  30.000000     30.000000
#> 150:  26.104521  27.832564  26.749170  24.046683  11.938056     35.525295
#> 151:  14.000000  14.000000  14.000000  14.000000  14.000000     14.000000
#> 152:  18.000000  18.000000  18.000000  18.000000  18.000000     18.000000
#> 153:  20.000000  20.000000  20.000000  20.000000  20.000000     20.000000
#>      Ozone_chac_imp
#>   1:        (40,60]
#>   2:        (20,40]
#>   3:         (0,20]
#>   4:         (0,20]
#>   5:           <NA>
#>  ---               
#> 149:        (20,40]
#> 150:           <NA>
#> 151:         (0,20]
#> 152:         (0,20]
#> 153:         (0,20]

# Sample of results: the first five rows where the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#>    Ozone Solar.R Wind Temp Day Intercept index   weights groups x_character
#> 1:    NA      NA 14.3   56   5         1     5 0.9945367      5        <NA>
#> 2:    NA     194  8.6   69  10         1    10 0.9950548      5   (140,210]
#> 3:    NA      66 16.6   57  25         1    25 1.0124095      5      (0,70]
#> 4:    NA     266 14.9   58  26         1    26 1.0047912      5   (210,280]
#> 5:    NA      NA  8.0   57  27         1    27 0.9998296      5        <NA>
#>    Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> 1:       <NA>    <NA>         NA    107.2179          (0,70]   23.36856
#> 2:       <NA>    <NA>         NA    194.0000       (140,210]   26.03157
#> 3:       <NA>    <NA>         NA     66.0000          (0,70]   29.68484
#> 4:       <NA>    <NA>         NA    266.0000       (210,280]   71.09846
#> 5:       <NA>    <NA>         NA    136.1804          (0,70]   29.26658
#>    Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> 1:   6.504706   6.097045   4.468612   5.127981   9.407458      9.162393
#> 2:  22.936424  20.418704  24.059228  20.388899  30.532943     24.061295
#> 3:  12.489261   5.580260   4.953874   5.311351   8.644043     11.110604
#> 4:  23.215834   9.329280  11.101567   5.663351  14.235716     22.440701
#> 5:   7.668188   9.368873   7.037318   6.391228  13.669493     12.233613
#>    Ozone_chac_imp
#> 1:           <NA>
#> 2:           <NA>
#> 3:           <NA>
#> 4:           <NA>
#> 5:           <NA>