Regular imputations to fill the missing data. Non-missing independent variables are used to approximate the missing observations of a dependent variable. Quantitative models were built with the Rcpp package and the C++ library Armadillo.

fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

# S3 method for class 'data.frame'
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

# S3 method for class 'data.table'
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

# S3 method for class 'matrix'
fill_NA(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-06)

Arguments

x

a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables

model

a character - possible options ("lda","lm_pred","lm_bayes","lm_noise")

posit_y

an integer/character - a position/name of dependent variable

posit_x

an integer/character vector - positions/names of independent variables

w

a numeric vector - a weighting variable - only positive values, Default:NULL

logreg

a boolean - whether the dependent variable has a log-normal distribution (numeric). If TRUE, a log-regression is evaluated and the exponential of the results is returned. Default: FALSE

ridge

a numeric - a value added to diagonal elements of the x'x matrix, Default: 1e-6

Value

imputations in a numeric/logical/character/factor (similar to the input type) vector format

Methods (by class)

  • fill_NA(data.frame): S3 method for data.frame

  • fill_NA(data.table): S3 method for data.table

  • fill_NA(matrix): S3 method for matrix

Note

It is assumed that users add the intercept on their own. The miceFast module provides the most efficient environment; the second recommended option is to use data.table and the numeric matrix data type. The lda model is assessed only if there are more than 15 complete observations, and the lm models only if the number of independent variables is smaller than the number of observations.

See also

Examples

library(miceFast)
library(dplyr)
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
library(data.table)
#> 
#> Attaching package: ‘data.table’
#> The following objects are masked from ‘package:dplyr’:
#> 
#>     between, first, last
### Data
# airquality dataset with additional variables
data(air_miss)
### Intro: dplyr
# IMPUTATIONS
# Each step below appends one imputed column to air_miss; later steps reuse
# columns created by earlier ones (e.g. x_character_imp), so the order matters.
air_miss <- air_miss %>%
  # Imputations with a grouping option (models are assessed separately for each group),
  # taking the provided weights into account
  group_by(groups) %>%
  do(mutate(., Solar_R_imp = fill_NA(
    x = .,
    model = "lm_pred",
    posit_y = "Solar.R",
    posit_x = c("Wind", "Temp", "Intercept"),
    w = .[["weights"]]
  ))) %>%
  ungroup() %>%
  # Imputations - discrete variable
  mutate(x_character_imp = fill_NA(
    x = .,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp")
  )) %>%
  # logreg = TRUE is used because Ozone is almost log-normally distributed;
  # intercept-only model, i.e. imputations around the mean
  mutate(Ozone_imp1 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )) %>%
  # imputations using column positions instead of names:
  # 1 = Ozone, 4 = Temp, 6 = Intercept (see the column order in the printed results)
  mutate(Ozone_imp2 = fill_NA(
    x = .,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  )) %>%
  # multiple imputations (average of k = 30 imputations)
  # with a discrete (character) independent variable, weights and the logreg option
  mutate(Ozone_imp3 = fill_NA_N(
    x = .,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  # same setup as Ozone_imp3 but with the "lm_bayes" model
  mutate(Ozone_imp4 = fill_NA_N(
    x = .,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 30
  )) %>%
  # the next two imputations are assessed separately within each group
  group_by(groups) %>%
  do(mutate(., Ozone_imp5 = fill_NA(
    x = .,
    model = "lm_pred",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE
  ))) %>%
  do(mutate(., Ozone_imp6 = fill_NA_N(
    x = .,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .[["weights"]],
    logreg = TRUE,
    k = 20
  ))) %>%
  ungroup() %>%
  # Average of a few methods (row-wise mean over all columns starting with "Ozone_imp")
  mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
  # Protecting against collinearity or a low number of observations - across small groups
  # Be careful when using a grouping option:
  # there is no built-in protection against collinearity or a low number of observations.
  # A tryCatch(fill_NA(...), error = function(e) return(...)) can be used as a fallback;
  # here the fallback returns the original, un-imputed Ozone_chac column of the group.
  # NOTE(review): "Month" does not appear among the columns in the printed results;
  # if it is absent, presumably fill_NA errors and the fallback is always taken - confirm.
  group_by(groups) %>%
  do(mutate(., Ozone_chac_imp = tryCatch(
    fill_NA(
      x = .,
      model = "lda",
      posit_y = "Ozone_chac",
      posit_x = c(
        "Intercept",
        "Month",
        "Day",
        "Temp",
        "x_character_imp"
      ),
      w = .[["weights"]]
    ),
    error = function(e) .[["Ozone_chac"]]
  ))) %>%
  ungroup()

# Sample of results: the first five rows where the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#> # A tibble: 5 × 23
#>   Ozone Solar.R  Wind  Temp   Day Intercept index weights groups x_character
#>   <dbl>   <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl>   <dbl> <fct>  <chr>      
#> 1    NA      NA  14.3    56     5         1     5   0.995 5      NA         
#> 2    NA     194   8.6    69    10         1    10   0.995 5      (140,210]  
#> 3    NA      66  16.6    57    25         1    25   1.01  5      (0,70]     
#> 4    NA     266  14.9    58    26         1    26   1.00  5      (210,280]  
#> 5    NA      NA   8      57    27         1    27   1.00  5      NA         
#> # ℹ 13 more variables: Ozone_chac <chr>, Ozone_f <fct>, Ozone_high <lgl>,
#> #   Solar_R_imp <dbl>, x_character_imp <chr>, Ozone_imp1 <dbl>,
#> #   Ozone_imp2 <dbl>, Ozone_imp3 <dbl>, Ozone_imp4 <dbl>, Ozone_imp5 <dbl>,
#> #   Ozone_imp6 <dbl>, Ozone_imp_mix <dbl>, Ozone_chac_imp <chr>

### Intro: data.table
# IMPUTATIONS
# Each step below adds one imputed column with `:=`; later steps reuse
# columns created by earlier ones (e.g. x_character_imp), so the order matters.
# Imputations with a grouping option (models are assessed separately for each group),
# taking the provided weights into account
data(air_miss)
setDT(air_miss)
air_miss[, Solar_R_imp := fill_NA_N(
  x = .SD,
  model = "lm_bayes",
  posit_y = "Solar.R",
  posit_x = c("Wind", "Temp", "Intercept"),
  w = .SD[["weights"]],
  k = 100
), by = .(groups)] %>%
  # Imputations - discrete variable
  .[, x_character_imp := fill_NA(
    x = .SD,
    model = "lda",
    posit_y = "x_character",
    posit_x = c("Wind", "Temp", "groups")
  )] %>%
  # logreg = TRUE is used because Ozone is almost log-normally distributed;
  # intercept-only model, i.e. imputations around the mean
  .[, Ozone_imp1 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept"),
    logreg = TRUE
  )] %>%
  # imputations using column positions instead of names:
  # 1 = Ozone, 4 = Temp, 6 = Intercept (see the column order in the printed results)
  .[, Ozone_imp2 := fill_NA(
    x = .SD,
    model = "lm_bayes",
    posit_y = 1,
    posit_x = c(4, 6),
    logreg = TRUE
  )] %>%
  # multiple imputations (average of k = 30 imputations)
  # with a discrete (character) independent variable, weights and the logreg option
  .[, Ozone_imp3 := fill_NA_N(
    x = .SD,
    model = "lm_noise",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  # same setup as Ozone_imp3 but with the "lm_bayes" model
  .[, Ozone_imp4 := fill_NA_N(
    x = .SD,
    model = "lm_bayes",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 30
  )] %>%
  # the next two imputations are assessed separately within each group
  .[, Ozone_imp5 := fill_NA(
    x = .SD,
    model = "lm_pred",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE
  ), .(groups)] %>%
  .[, Ozone_imp6 := fill_NA_N(
    x = .SD,
    model = "pmm",
    posit_y = "Ozone",
    posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
    w = .SD[["weights"]],
    logreg = TRUE,
    k = 10
  ), .(groups)] %>%
  # Average of a few methods (row-wise mean over columns Ozone_imp1 to Ozone_imp6)
  .[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
  # Protecting against collinearity or a low number of observations - across small groups
  # Be careful when using a data.table grouping option:
  # there is no built-in protection against collinearity or a low number of observations.
  # A tryCatch(fill_NA(...), error = function(e) return(...)) can be used as a fallback;
  # here the fallback returns the original, un-imputed Ozone_chac column of the group.
  # NOTE(review): "Month" does not appear among the columns in the printed results;
  # if it is absent, presumably fill_NA errors and the fallback is always taken - confirm.

  .[, Ozone_chac_imp := tryCatch(
    fill_NA(
      x = .SD,
      model = "lda",
      posit_y = "Ozone_chac",
      posit_x = c(
        "Intercept",
        "Month",
        "Day",
        "Temp",
        "x_character_imp"
      ),
      w = .SD[["weights"]]
    ),
    error = function(e) .SD[["Ozone_chac"]]
  ), .(groups)]
#>      Ozone Solar.R  Wind  Temp   Day Intercept index   weights groups
#>      <num>   <num> <num> <num> <num>     <num> <num>     <num> <fctr>
#>   1:    41     190   7.4    67     1         1     1 1.0186350      5
#>   2:    36     118   8.0    72     2         1     2 1.0107583      5
#>   3:    12     149  12.6    74     3         1     3 0.9891023      5
#>   4:    18     313  11.5    62     4         1     4 0.9913450      5
#>   5:    NA      NA  14.3    56     5         1     5 0.9945367      5
#>  ---                                                                 
#> 149:    30     193   6.9    70    26         1   149 0.9985280      9
#> 150:    NA     145  13.2    77    27         1   150 1.0001786      9
#> 151:    14     191  14.3    75    28         1   151 1.0024673      9
#> 152:    18     131   8.0    76    29         1   152 0.9968826      9
#> 153:    20     223  11.5    68    30         1   153 1.0056592      9
#>      x_character Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp
#>           <char>     <char>  <fctr>     <lgcl>       <num>          <char>
#>   1:   (140,210]    (40,60] (40,60]      FALSE    190.0000       (140,210]
#>   2:    (70,140]    (20,40] (20,40]      FALSE    118.0000        (70,140]
#>   3:   (140,210]     (0,20]  (0,20]      FALSE    149.0000       (140,210]
#>   4:   (280,350]     (0,20]  (0,20]      FALSE    313.0000       (280,350]
#>   5:        <NA>       <NA>    <NA>         NA    109.0075          (0,70]
#>  ---                                                                      
#> 149:   (140,210]    (20,40] (20,40]      FALSE    193.0000       (140,210]
#> 150:   (140,210]       <NA>    <NA>         NA    145.0000       (140,210]
#> 151:   (140,210]     (0,20]  (0,20]      FALSE    191.0000       (140,210]
#> 152:    (70,140]     (0,20]  (0,20]      FALSE    131.0000        (70,140]
#> 153:   (210,280]     (0,20]  (0,20]      FALSE    223.0000       (210,280]
#>      Ozone_imp1 Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6
#>           <num>      <num>      <num>      <num>      <num>      <num>
#>   1:   41.00000  41.000000  41.000000  41.000000  41.000000   41.00000
#>   2:   36.00000  36.000000  36.000000  36.000000  36.000000   36.00000
#>   3:   12.00000  12.000000  12.000000  12.000000  12.000000   12.00000
#>   4:   18.00000  18.000000  18.000000  18.000000  18.000000   18.00000
#>   5:   37.27863   6.707116   5.428128   5.534204   5.127981   13.66949
#>  ---                                                                  
#> 149:   30.00000  30.000000  30.000000  30.000000  30.000000   30.00000
#> 150:   30.33189  61.381910  23.171726  28.634885  24.046683   25.46112
#> 151:   14.00000  14.000000  14.000000  14.000000  14.000000   14.00000
#> 152:   18.00000  18.000000  18.000000  18.000000  18.000000   18.00000
#> 153:   20.00000  20.000000  20.000000  20.000000  20.000000   20.00000
#>      Ozone_imp_mix Ozone_chac_imp
#>              <num>         <char>
#>   1:      41.00000        (40,60]
#>   2:      36.00000        (20,40]
#>   3:      12.00000         (0,20]
#>   4:      18.00000         (0,20]
#>   5:      12.29093           <NA>
#>  ---                             
#> 149:      30.00000        (20,40]
#> 150:      32.17137           <NA>
#> 151:      14.00000         (0,20]
#> 152:      18.00000         (0,20]
#> 153:      20.00000         (0,20]

# Sample of results: the first five rows where the first column (Ozone) is missing
air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#>    Ozone Solar.R  Wind  Temp   Day Intercept index   weights groups x_character
#>    <num>   <num> <num> <num> <num>     <num> <num>     <num> <fctr>      <char>
#> 1:    NA      NA  14.3    56     5         1     5 0.9945367      5        <NA>
#> 2:    NA     194   8.6    69    10         1    10 0.9950548      5   (140,210]
#> 3:    NA      66  16.6    57    25         1    25 1.0124095      5      (0,70]
#> 4:    NA     266  14.9    58    26         1    26 1.0047912      5   (210,280]
#> 5:    NA      NA   8.0    57    27         1    27 0.9998296      5        <NA>
#>    Ozone_chac Ozone_f Ozone_high Solar_R_imp x_character_imp Ozone_imp1
#>        <char>  <fctr>     <lgcl>       <num>          <char>      <num>
#> 1:       <NA>    <NA>         NA    109.0075          (0,70]   37.27863
#> 2:       <NA>    <NA>         NA    194.0000       (140,210]   59.40327
#> 3:       <NA>    <NA>         NA     66.0000          (0,70]   39.00817
#> 4:       <NA>    <NA>         NA    266.0000       (210,280]  156.31780
#> 5:       <NA>    <NA>         NA    112.2568          (0,70]   19.56298
#>    Ozone_imp2 Ozone_imp3 Ozone_imp4 Ozone_imp5 Ozone_imp6 Ozone_imp_mix
#>         <num>      <num>      <num>      <num>      <num>         <num>
#> 1:   6.707116   5.428128   5.534204   5.127981   13.66949      12.29093
#> 2:  10.930479  27.271427  27.102997  20.388899   14.38842      26.58092
#> 3:   4.694893   5.513273   4.548383   5.311351   12.82412      11.98336
#> 4:  18.047066  11.236547   8.884315   5.663351   15.18932      35.88973
#> 5:  16.322812   9.198472   8.382237   6.391228   13.66949      12.25454
#>    Ozone_chac_imp
#>            <char>
#> 1:           <NA>
#> 2:           <NA>
#> 3:           <NA>
#> 4:           <NA>
#> 5:           <NA>