kknn train.kknn() fails on this data set

Here is a data set on which train.kknn() fails:

args <- list(formula = target ~ ., data = structure(list(V1 = c(270.093653981481, 
279.505245324074, 269.973504027778, 272.901858449074), V2 = c(272.251986666667, 
263.08405, 272.924066666667, 278.41284), V3 = c(271.785323333333, 
262.78406, 272.524073333333, 277.772833333333), V4 = c(272.018636666667, 
263.08405, 272.757393333333, 277.486173333333), V5 = c(272.25199, 
263.38406, 272.890733333333, 277.096166666667), V6 = c(271.785323333333, 
262.78406, 272.390733333333, 277.18616), V7 = c(271.91864, 262.98407, 
272.4574, 276.839506666667), V8 = c(272.051973333333, 263.28406, 
272.59074, 276.626166666667), V9 = c(271.55198, 262.38406, 272.024073333333, 
276.269506666667), V10 = c(271.68532, 262.68405, 272.157413333333, 
276.019493333333), V11 = c(271.91864, 263.08405, 272.290726666667, 
275.92616), V12 = c(272.651983333333, 263.48407, 273.257393333333, 
278.152833333333), V13 = c(272.885303333333, 264.18405, 273.49074, 
277.609506666667), V14 = c(272.151983333333, 263.08405, 272.890733333333, 
278.1795), V15 = c(272.485313333333, 263.68405, 273.157413333333, 
277.792833333333), V16 = c(272.818656666667, 263.78406, 273.49074, 
277.526166666667), V17 = c(272.05198, 263.18405, 272.824066666667, 
277.819493333333), V18 = c(272.318656666667, 263.78406, 273.057413333333, 
277.442833333333), V19 = c(272.718656666667, 263.78406, 273.290726666667, 
277.31284), V20 = c(272.7428, 268.4415, 273.31012, 279.466216666667
), V21 = c(272.316113333333, 266.9915, 272.953453333333, 278.729526666667
), V22 = c(272.626133333333, 267.26147, 273.210133333333, 278.232876666667
), V23 = c(272.726113333333, 267.76147, 273.28346, 277.552876666667
), V24 = c(272.2628, 266.7115, 272.920126666667, 277.812876666667
), V25 = c(272.492786666667, 266.9115, 273.086786666667, 277.316213333333
), V26 = c(272.579446666667, 267.3415, 273.110133333333, 276.89288
), V27 = c(272.099446666667, 266.36148, 272.676786666667, 276.769553333333
), V28 = c(272.296133333333, 266.55148, 272.880126666667, 276.31955
), V29 = c(272.39611, 266.85147, 272.906786666667, 276.13288), 
    V30 = c(273.06945, 268.7415, 273.610133333333, 279.082886666667
    ), V31 = c(273.202793333333, 268.82147, 273.783466666667, 
    278.126206666667), V32 = c(272.606106666667, 268.02148, 273.2368, 
    279.17288), V33 = c(272.81945, 268.23148, 273.470133333333, 
    278.402873333333), V34 = c(273.222786666667, 268.85147, 273.813466666667, 
    277.992866666667), V35 = c(272.526126666667, 267.60147, 273.180133333333, 
    278.68954), V36 = c(272.65278, 267.79147, 273.3268, 277.78286
    ), V37 = c(273.07945, 268.5315, 273.663466666667, 277.686206666667
    ), V38 = c(272.35208, 268.547208333333, 272.98351, 276.773517222222
    ), V39 = c(271.919287777778, 266.888871666667, 272.522400555556, 
    276.060173333333), V40 = c(272.198186666667, 267.080536666667, 
    272.846842222222, 275.789623888889), V41 = c(272.285394444444, 
    267.532205, 272.974061666667, 275.444076111111), V42 = c(271.83708, 
    266.605541666667, 272.473509444444, 275.472956111111), V43 = c(272.047065555556, 
    266.725548333333, 272.721287777778, 275.296294444444), V44 = c(272.14762, 
    267.09221, 272.783520555556, 275.118516666667), V45 = c(271.655945555556, 
    266.302203333333, 272.221287777778, 274.861854444444), V46 = c(271.849853333333, 
    266.41887, 272.456836111111, 274.800739444444), V47 = c(271.969842777778, 
    266.612201666667, 272.529621111111, 274.771848888889), V48 = c(272.667068333333, 
    268.820548333333, 273.275183888889, 276.432408888889), V49 = c(272.812626666667, 
    268.942196666667, 273.412401111111, 275.924067777778), V50 = c(272.255948888889, 
    268.047198333333, 272.902955, 276.460183333333), V51 = c(272.470393888889, 
    268.295543333333, 273.150182222222, 275.97352), V52 = c(272.808732222222, 
    268.768863333333, 273.414064444444, 275.861843333333), V53 = c(272.167622222222, 
    267.568873333333, 272.843507222222, 276.070181111111), V54 = c(272.287625555556, 
    267.79054, 273.027958333333, 275.573514444444), V55 = c(272.65373, 
    268.367206666667, 273.282409444444, 275.614634444444), V56 = c(271.765283333333, 
    263.975515, 272.534132777778, 276.21454), V57 = c(271.320848333333, 
    263.208858333333, 272.050807222222, 275.524530555556), V58 = c(271.526395, 
    263.475513333333, 272.289693888889, 275.404539444444), V59 = c(271.687515, 
    263.70886, 272.395252222222, 275.172864444444), V60 = c(271.270839444444, 
    263.058845, 271.945243888889, 275.098974444444), V61 = c(271.426391111111, 
    263.275516666667, 272.10636, 274.957872222222), V62 = c(271.537506666667, 
    263.542185, 272.161918333333, 274.922308888889), V63 = c(271.054173333333, 
    262.808846666667, 271.584137222222, 274.496197222222), V64 = c(271.215284444444, 
    263.058843333333, 271.756363888889, 274.504533333333), V65 = c(271.426391111111, 
    263.325511666667, 271.878584444444, 274.583413333333), V66 = c(272.115292777778, 
    264.175521666667, 272.839692222222, 275.871197222222), V67 = c(272.376390555556, 
    264.70885, 273.045248333333, 275.580645555556), V68 = c(271.715282777778, 
    263.775518333333, 272.478578888889, 275.903424444444), V69 = c(272.026396666667, 
    264.225521666667, 272.773032222222, 275.561202222222), V70 = c(272.237515, 
    264.308853333333, 273.011913333333, 275.555646666667), V71 = c(271.62084, 
    263.692186666667, 272.400802777778, 275.565636666667), V72 = c(271.881960555556, 
    264.158851666667, 272.639697222222, 275.308424444444), V73 = c(272.137515, 
    264.175518333333, 272.828584444444, 275.342313333333), V74 = c(61, 
    13, 37, 61), V75 = c(2, 0, 1, 2), target = c(5.407, 5.73, 
    5.407, 5.303), V77 = c(6.352, 6.388, 6.352, 6.339), V78 = c(-0.0909756944444445, 
    -0.107152777777778, -0.110204861111111, -0.111579861111111
    )), row.names = 3:6, class = "data.frame"), kmax = 3L, kernel = "rectangular")

library(kknn)
do.call(train.kknn, args)

# Error in best[1, 2] : subscript out of bounds

The error is happening because of the return value of dmEuclid in the C code. When I step through in the R debugger, I see this:

Browse[2]> dmtmp$cl
 [1]          0          1          2          2          2          0          0          0
 [9]          1          2          1          1      32676      32676      32676      32676
[17] 1001200320 1001200320 1001200320          3

I'm guessing this is either uninitialized memory or an overflow/underflow problem.

Mar 15 '19 00:03 kenahoo

Hi @KlausVigo , any thoughts on this?

Mar 19 '19 18:03 kenahoo

Hi @kenahoo, you are right, this looks like an underflow or overflow problem. What system are you using? Maybe you can send me the output of sessionInfo()?

On my machine your code is working, so I can't really reproduce your problem.

...
 kmax = 3L, kernel = "rectangular")

Type of response variable: continuous
minimal mean absolute error: 0.10675
Minimal mean squared error: 0.02878625
Best kernel: rectangular
Best k: 1

Regards, Klaus

Mar 19 '19 22:03 KlausVigo

Hi Klaus,

Actually, it seems to be non-deterministic. Here are a few runs in a row:

> do.call(train.kknn, args)
Error in cl[C] : only 0's may be mixed with negative subscripts
> do.call(train.kknn, args)
Error in best[1, 2] : subscript out of bounds
> do.call(train.kknn, args)
Error in best[1, 2] : subscript out of bounds
> do.call(train.kknn, args)
Error in cl[C] : only 0's may be mixed with negative subscripts
> do.call(train.kknn, args)
Error in best[1, 2] : subscript out of bounds
> do.call(train.kknn, args)

Call:
(function (formula, data, kmax = 11, ks = NULL, distance = 2,     kernel = "optimal", ykernel = NULL, scale = TRUE, contrasts = c(unordered = "contr.dummy",         ordered = "contr.ordinal"), ...) {    if (is.null(ykernel))         ykernel = 0    weight.y = function(l = 1, diff = 0) {        k = diff + 1        result = matrix(0, l, l)  
# ... (that one was successful)

My session info:

> sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-apple-darwin17.7.0 (64-bit)
Running under: macOS High Sierra 10.13.6

Matrix products: default
BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblasp-r0.3.5.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] kknn_1.3.1

loaded via a namespace (and not attached):
[1] compiler_3.5.2  magrittr_1.5    Matrix_1.2-15   igraph_1.2.2   
[5] grid_3.5.2      pkgconfig_2.0.2 lattice_0.20-38

Mar 20 '19 14:03 kenahoo

I took R (mostly) out of the chain, and it looks like it's non-deterministic at the C level. Stepping through with the lldb debugger, I see this:

(lldb) p dmEuclid(mm_data, mm_data, &m, &p, &q, dm, cl, &k, we)
(lldb) p cl
(int [20]) $2 = {
  [0] = 0
  [1] = 1
  [2] = 2
  [3] = 3
  [4] = 2
  [5] = 0
  [6] = 0
  [7] = 2
  [8] = 3
  [9] = 2
  [10] = 3
  [11] = 0
  [12] = 1
  [13] = 3
  [14] = 1
  [15] = 1
  [16] = 0
  [17] = 1
  [18] = 3
  [19] = 1
}
(lldb) p dmEuclid(mm_data, mm_data, &m, &p, &q, dm, cl, &k, we)
(lldb) p cl
(int [20]) $3 = {
  [0] = 0
  [1] = 1
  [2] = 2
  [3] = 3
  [4] = 2
  [5] = 0
  [6] = 0
  [7] = 2
  [8] = 3
  [9] = 2
  [10] = 3
  [11] = 0
  [12] = 1
  [13] = 3
  [14] = 1
  [15] = 1
  [16] = -1610612736
  [17] = 1
  [18] = 3
  [19] = 1
}

I also noticed some mismatches between the order of argument names that train.kknn is passing, and what dmEuclid accepts:

.C("dmEuclid",
   as.double(mm.data),
   as.double(mm.data), 
   as.integer(m),  <---- different order
   as.integer(p),  <---- different order
   as.integer(q),  <---- different order
   dm = double((kmax + 2L) * p),
   cl = integer((kmax + 2L) * p), 
   k = as.integer(kmax + 2),
   as.double(we),
   PACKAGE = "kknn")

void dmEuclid(double *learn,
              double *valid,
              int *n,   <---- different order
              int *m,   <---- different order
              int *p,   <---- different order
              double *dm,
              int *cl,
              int *k,
              double *weights)

Any chance this is the issue?

Mar 20 '19 16:03 kenahoo

Hi @kenahoo, I should rename the parameters so that they correspond in the R and C code. I think I finally figured it out. I kind of assumed that kmax + 2 is smaller than the number of observations.
It should now work if kmax + 1 is equal to the number of observations, and it decreases kmax otherwise. Can you check if my last commit fixes your problem?

Mar 20 '19 18:03 KlausVigo

Hi @KlausVigo , I think that fixes the problem indeed.

Curiously, I could only get this problem to manifest when running R interactively, not through Rscript. Maybe memory initialization is different in that case or something.

Thanks.

Mar 20 '19 22:03 kenahoo

kknn kknn copied to clipboard

train.kknn() fails on this data set

kknn
kknn copied to clipboard