Multiple imputations to fill the missing data. Non-missing independent variables are used to approximate missing observations of a dependent variable. The quantitative models are built with the Rcpp package and the C++ library Armadillo.

fill_NA_N(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-06
)

# S3 method for data.frame
fill_NA_N(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-06
)

# S3 method for data.table
fill_NA_N(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-06
)

# S3 method for matrix
fill_NA_N(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-06
)

Arguments

x

a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables

model

a character - possible options ("lm_bayes","lm_noise","pmm")

posit_y

an integer/character - a position/name of dependent variable

posit_x

an integer/character vector - positions/names of independent variables

w

a numeric vector - a weighting variable - only positive values, Default: NULL

logreg

a boolean - set to TRUE if the dependent variable has a log-normal distribution (numeric). If TRUE, a log-regression is evaluated and the exponential of the results is returned. Default: FALSE

k

an integer - the number of multiple imputations or, for pmm, the number of closest points from which one random value is taken, Default: 10

ridge

a numeric - a value added to the diagonal elements of the x'x matrix, Default: 1e-6

Value

load imputations in a numeric/character/factor (similar to the input type) vector format

Methods (by class)

  • fill_NA_N(data.frame): S3 method for data.frame

  • fill_NA_N(data.table): S3 method for data.table

  • fill_NA_N(matrix): S3 method for matrix

Note

It is assumed that users add the intercept on their own. The miceFast module provides the most efficient environment; the second recommended option is to use data.table and the numeric matrix data type. The lda model is assessed only if there are more than 15 complete observations, and the lm models only if the number of variables is smaller than the number of observations.

See also

Examples

library(miceFast)
library(dplyr)
library(data.table)
### Data
# airquality dataset with additional variables
data(air_miss)
### Intro: dplyr
# IMPUTATIONS
air_miss <- air_miss %>%
  # Imputations with a grouping option (models are separately assessed for each group)
  # taking into account provided weights
  group_by(groups) %>%
  do(mutate(., Solar_R_imp = fill_NA(
    x = .,
    model = "lm_pred",
    posit_y = "Solar.R",
    posit_x = c("Wind", "Temp", "Intercept"),
    w = .[["weights"]]
  ))) %>%
  ungroup() %>%
  # Imputations - discrete variable
  mutate(x_character_imp = fill_NA(
    x = .,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp")
  )) %>%
  # logreg was used because almost log-normal distribution of Ozone
  # imputations around mean
  mutate(Ozone_imp1 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )) %>%
  # imputations using positions - Intercept, Temp
  mutate(Ozone_imp2 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  )) %>%
  # multiple imputations (average of x30 imputations)
  # with a factor independent variable, weights and logreg options
  mutate(Ozone_imp3 = fill_NA_N(
    x = .,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  mutate(Ozone_imp4 = fill_NA_N(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  group_by(groups) %>%
  do(mutate(., Ozone_imp5 = fill_NA(
    x = .,
    model = "lm_pred",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE
  ))) %>%
  do(mutate(., Ozone_imp6 = fill_NA_N(
    x = .,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 20
  ))) %>%
  ungroup() %>%
  # Average of a few methods
  mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
  # Protecting against collinearity or low number of observations - across small groups
  # Be careful when using a grouping option
  # because of the lack of protection against collinearity or a low number of observations.
  # A tryCatch(fill_NA(...),error=function(e) return(...)) could be used
  group_by(groups) %>%
  do(mutate(., Ozone_chac_imp = tryCatch(
    fill_NA(
      x = .,
      model = "lda",
      posit_y = "Ozone_chac",
      posit_x = c(
        "Intercept",
        "Month",
        "Day",
        "Temp",
        "x_character_imp"
      ),
      w = .[["weights"]]
    ),
    error = function(e) .[["Ozone_chac"]]
  ))) %>%
  ungroup()

# Sample of results
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> # A tibble: 5 × 23
#>   Ozone Solar.R  Wind  Temp   Day Intercept index weights groups x_cha…¹ Ozone…²
#>   <dbl>   <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl>   <dbl> <fct>  <chr>   <chr>  
#> 1    NA      NA  14.3    56     5         1     5   0.995 5      NA      NA     
#> 2    NA     194   8.6    69    10         1    10   0.995 5      (140,2… NA     
#> 3    NA      66  16.6    57    25         1    25   1.01  5      (0,70]  NA     
#> 4    NA     266  14.9    58    26         1    26   1.00  5      (210,2… NA     
#> 5    NA      NA   8      57    27         1    27   1.00  5      NA      NA     
#> # … with 12 more variables: Ozone_f <fct>, Ozone_high <lgl>, Solar_R_imp <dbl>,
#> #   x_character_imp <chr>, Ozone_imp1 <dbl>, Ozone_imp2 <dbl>,
#> #   Ozone_imp3 <dbl>, Ozone_imp4 <dbl>, Ozone_imp5 <dbl>, Ozone_imp6 <dbl>,
#> #   Ozone_imp_mix <dbl>, Ozone_chac_imp <chr>, and abbreviated variable names
#> #   ¹​x_character, ²​Ozone_chac

### Intro: data.table
# IMPUTATIONS
# Imputations with a grouping option (models are separately assessed for each group)
# taking into account provided weights
data(air_miss)
setDT(air_miss)
air_miss[, Solar_R_imp := fill_NA_N(
  x = .SD,
  model = "lm_bayes",
  posit_y = "Solar.R",
  posit_x = c("Wind", "Temp", "Intercept"),
  w = .SD[["weights"]],
  k = 100
), by = .(groups)] %>%
  # Imputations - discrete variable
  .[, x_character_imp := fill_NA(
    x = .SD,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp", "groups")
  )] %>%
  # logreg was used because almost log-normal distribution of Ozone
  # imputations around mean
  .[, Ozone_imp1 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )] %>%
  # imputations using positions - Intercept, Temp
  .[, Ozone_imp2 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  )] %>%
  # model with a factor independent variable
  # multiple imputations (average of x30 imputations)
  # with a factor independent variable, weights and logreg options
  .[, Ozone_imp3 := fill_NA_N(
    x = .SD,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  .[, Ozone_imp4 := fill_NA_N(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  .[, Ozone_imp5 := fill_NA(
    x = .SD,
    model = "lm_pred",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE
  ), .(groups)] %>%
  .[, Ozone_imp6 := fill_NA_N(
    x = .SD,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 10
  ), .(groups)] %>%
  # Average of a few methods
  .[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
  # Protecting against collinearity or low number of observations - across small groups
  # Be careful when using a data.table grouping option
  # because of the lack of protection against collinearity or a low number of observations.
  # A tryCatch(fill_NA(...),error=function(e) return(...)) could be used

  .[, Ozone_chac_imp := tryCatch(
    fill_NA(
      x = .SD,
      model = "lda",
      posit_y = "Ozone_chac",
      posit_x = c(
        "Intercept",
        "Month",
        "Day",
        "Temp",
        "x_character_imp"
      ),
      w = .SD[["weights"]]
    ),
    error = function(e) .SD[["Ozone_chac"]]
  ), .(groups)]
#>      Ozone Solar.R Wind Temp Day Intercept index   weights groups x_character
#>   1:    41     190  7.4   67   1         1     1 1.0186350      5   (140,210]
#>   2:    36     118  8.0   72   2         1     2 1.0107583      5    (70,140]
#>   3:    12     149 12.6   74   3         1     3 0.9891023      5   (140,210]
#>   4:    18     313 11.5   62   4         1     4 0.9913450      5   (280,350]
#>   5:    NA      NA 14.3   56   5         1     5 0.9945367      5        <NA>
#>  ---                                                                         
#> 149:    30     193  6.9   70  26         1   149 0.9985280      9   (140,210]
#> 150:    NA     145 13.2   77  27         1   150 1.0001786      9   (140,210]
#> 151:    14     191 14.3   75  28         1   151 1.0024673      9   (140,210]
#> 152:    18     131  8.0   76  29         1   152 0.9968826      9    (70,140]
#> 153:    20     223 11.5   68  30         1   153 1.0056592      9   (210,280]
#>      Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#>   1:    (40,60] (40,60]      FALSE   190.00000       (140,210]   41.00000
#>   2:    (20,40] (20,40]      FALSE   118.00000        (70,140]   36.00000
#>   3:     (0,20]  (0,20]      FALSE   149.00000       (140,210]   12.00000
#>   4:     (0,20]  (0,20]      FALSE   313.00000       (280,350]   18.00000
#>   5:       <NA>    <NA>         NA    93.38071          (0,70]   39.86381
#>  ---                                                                     
#> 149:    (20,40] (20,40]      FALSE   193.00000       (140,210]   30.00000
#> 150:       <NA>    <NA>         NA   145.00000       (140,210]   31.51104
#> 151:     (0,20]  (0,20]      FALSE   191.00000       (140,210]   14.00000
#> 152:     (0,20]  (0,20]      FALSE   131.00000        (70,140]   18.00000
#> 153:     (0,20]  (0,20]      FALSE   223.00000       (210,280]   20.00000
#>      Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#>   1:   41.00000  41.000000   41.00000  41.000000   41.00000      41.00000
#>   2:   36.00000  36.000000   36.00000  36.000000   36.00000      36.00000
#>   3:   12.00000  12.000000   12.00000  12.000000   12.00000      12.00000
#>   4:   18.00000  18.000000   18.00000  18.000000   18.00000      18.00000
#>   5:   21.37254   5.262766    5.72173   5.127981   13.66949      15.16972
#>  ---                                                                     
#> 149:   30.00000  30.000000   30.00000  30.000000   30.00000      30.00000
#> 150:   12.19346  29.142736   20.93322  24.046683   23.53701      23.56069
#> 151:   14.00000  14.000000   14.00000  14.000000   14.00000      14.00000
#> 152:   18.00000  18.000000   18.00000  18.000000   18.00000      18.00000
#> 153:   20.00000  20.000000   20.00000  20.000000   20.00000      20.00000
#>      Ozone_chac_imp
#>   1:        (40,60]
#>   2:        (20,40]
#>   3:         (0,20]
#>   4:         (0,20]
#>   5:           <NA>
#>  ---               
#> 149:        (20,40]
#> 150:           <NA>
#> 151:         (0,20]
#> 152:         (0,20]
#> 153:         (0,20]

# Sample of results
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#>    Ozone Solar.R Wind Temp Day Intercept index   weights groups x_character
#> 1:    NA      NA 14.3   56   5         1     5 0.9945367      5        <NA>
#> 2:    NA     194  8.6   69  10         1    10 0.9950548      5   (140,210]
#> 3:    NA      66 16.6   57  25         1    25 1.0124095      5      (0,70]
#> 4:    NA     266 14.9   58  26         1    26 1.0047912      5   (210,280]
#> 5:    NA      NA  8.0   57  27         1    27 0.9998296      5        <NA>
#>    Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#> 1:       <NA>    <NA>         NA    93.38071          (0,70]   39.86381
#> 2:       <NA>    <NA>         NA   194.00000       (140,210]   50.32905
#> 3:       <NA>    <NA>         NA    66.00000          (0,70]   70.39088
#> 4:       <NA>    <NA>         NA   266.00000       (210,280]   80.25469
#> 5:       <NA>    <NA>         NA   128.47330          (0,70]   77.42150
#>    Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#> 1:  21.372536   5.262766   5.721730   5.127981   13.66949      15.16972
#> 2:  42.128453  23.343879  22.115402  20.388899   26.05740      30.72718
#> 3:  20.857649   4.483203   5.502857   5.311351   14.23572      20.13028
#> 4:   7.254263  10.940727   9.095296   5.663351   14.23572      21.24067
#> 5:   4.876854   7.116634   6.480032   6.391228   17.52725      19.96892
#>    Ozone_chac_imp
#> 1:           <NA>
#> 2:           <NA>
#> 3:           <NA>
#> 4:           <NA>
#> 5:           <NA>