kknn
kknn copied to clipboard
train.kknn() fails on this data set
Here is a data set on which train.kknn()
fails:
args <- list(formula = target ~ ., data = structure(list(V1 = c(270.093653981481,
279.505245324074, 269.973504027778, 272.901858449074), V2 = c(272.251986666667,
263.08405, 272.924066666667, 278.41284), V3 = c(271.785323333333,
262.78406, 272.524073333333, 277.772833333333), V4 = c(272.018636666667,
263.08405, 272.757393333333, 277.486173333333), V5 = c(272.25199,
263.38406, 272.890733333333, 277.096166666667), V6 = c(271.785323333333,
262.78406, 272.390733333333, 277.18616), V7 = c(271.91864, 262.98407,
272.4574, 276.839506666667), V8 = c(272.051973333333, 263.28406,
272.59074, 276.626166666667), V9 = c(271.55198, 262.38406, 272.024073333333,
276.269506666667), V10 = c(271.68532, 262.68405, 272.157413333333,
276.019493333333), V11 = c(271.91864, 263.08405, 272.290726666667,
275.92616), V12 = c(272.651983333333, 263.48407, 273.257393333333,
278.152833333333), V13 = c(272.885303333333, 264.18405, 273.49074,
277.609506666667), V14 = c(272.151983333333, 263.08405, 272.890733333333,
278.1795), V15 = c(272.485313333333, 263.68405, 273.157413333333,
277.792833333333), V16 = c(272.818656666667, 263.78406, 273.49074,
277.526166666667), V17 = c(272.05198, 263.18405, 272.824066666667,
277.819493333333), V18 = c(272.318656666667, 263.78406, 273.057413333333,
277.442833333333), V19 = c(272.718656666667, 263.78406, 273.290726666667,
277.31284), V20 = c(272.7428, 268.4415, 273.31012, 279.466216666667
), V21 = c(272.316113333333, 266.9915, 272.953453333333, 278.729526666667
), V22 = c(272.626133333333, 267.26147, 273.210133333333, 278.232876666667
), V23 = c(272.726113333333, 267.76147, 273.28346, 277.552876666667
), V24 = c(272.2628, 266.7115, 272.920126666667, 277.812876666667
), V25 = c(272.492786666667, 266.9115, 273.086786666667, 277.316213333333
), V26 = c(272.579446666667, 267.3415, 273.110133333333, 276.89288
), V27 = c(272.099446666667, 266.36148, 272.676786666667, 276.769553333333
), V28 = c(272.296133333333, 266.55148, 272.880126666667, 276.31955
), V29 = c(272.39611, 266.85147, 272.906786666667, 276.13288),
V30 = c(273.06945, 268.7415, 273.610133333333, 279.082886666667
), V31 = c(273.202793333333, 268.82147, 273.783466666667,
278.126206666667), V32 = c(272.606106666667, 268.02148, 273.2368,
279.17288), V33 = c(272.81945, 268.23148, 273.470133333333,
278.402873333333), V34 = c(273.222786666667, 268.85147, 273.813466666667,
277.992866666667), V35 = c(272.526126666667, 267.60147, 273.180133333333,
278.68954), V36 = c(272.65278, 267.79147, 273.3268, 277.78286
), V37 = c(273.07945, 268.5315, 273.663466666667, 277.686206666667
), V38 = c(272.35208, 268.547208333333, 272.98351, 276.773517222222
), V39 = c(271.919287777778, 266.888871666667, 272.522400555556,
276.060173333333), V40 = c(272.198186666667, 267.080536666667,
272.846842222222, 275.789623888889), V41 = c(272.285394444444,
267.532205, 272.974061666667, 275.444076111111), V42 = c(271.83708,
266.605541666667, 272.473509444444, 275.472956111111), V43 = c(272.047065555556,
266.725548333333, 272.721287777778, 275.296294444444), V44 = c(272.14762,
267.09221, 272.783520555556, 275.118516666667), V45 = c(271.655945555556,
266.302203333333, 272.221287777778, 274.861854444444), V46 = c(271.849853333333,
266.41887, 272.456836111111, 274.800739444444), V47 = c(271.969842777778,
266.612201666667, 272.529621111111, 274.771848888889), V48 = c(272.667068333333,
268.820548333333, 273.275183888889, 276.432408888889), V49 = c(272.812626666667,
268.942196666667, 273.412401111111, 275.924067777778), V50 = c(272.255948888889,
268.047198333333, 272.902955, 276.460183333333), V51 = c(272.470393888889,
268.295543333333, 273.150182222222, 275.97352), V52 = c(272.808732222222,
268.768863333333, 273.414064444444, 275.861843333333), V53 = c(272.167622222222,
267.568873333333, 272.843507222222, 276.070181111111), V54 = c(272.287625555556,
267.79054, 273.027958333333, 275.573514444444), V55 = c(272.65373,
268.367206666667, 273.282409444444, 275.614634444444), V56 = c(271.765283333333,
263.975515, 272.534132777778, 276.21454), V57 = c(271.320848333333,
263.208858333333, 272.050807222222, 275.524530555556), V58 = c(271.526395,
263.475513333333, 272.289693888889, 275.404539444444), V59 = c(271.687515,
263.70886, 272.395252222222, 275.172864444444), V60 = c(271.270839444444,
263.058845, 271.945243888889, 275.098974444444), V61 = c(271.426391111111,
263.275516666667, 272.10636, 274.957872222222), V62 = c(271.537506666667,
263.542185, 272.161918333333, 274.922308888889), V63 = c(271.054173333333,
262.808846666667, 271.584137222222, 274.496197222222), V64 = c(271.215284444444,
263.058843333333, 271.756363888889, 274.504533333333), V65 = c(271.426391111111,
263.325511666667, 271.878584444444, 274.583413333333), V66 = c(272.115292777778,
264.175521666667, 272.839692222222, 275.871197222222), V67 = c(272.376390555556,
264.70885, 273.045248333333, 275.580645555556), V68 = c(271.715282777778,
263.775518333333, 272.478578888889, 275.903424444444), V69 = c(272.026396666667,
264.225521666667, 272.773032222222, 275.561202222222), V70 = c(272.237515,
264.308853333333, 273.011913333333, 275.555646666667), V71 = c(271.62084,
263.692186666667, 272.400802777778, 275.565636666667), V72 = c(271.881960555556,
264.158851666667, 272.639697222222, 275.308424444444), V73 = c(272.137515,
264.175518333333, 272.828584444444, 275.342313333333), V74 = c(61,
13, 37, 61), V75 = c(2, 0, 1, 2), target = c(5.407, 5.73,
5.407, 5.303), V77 = c(6.352, 6.388, 6.352, 6.339), V78 = c(-0.0909756944444445,
-0.107152777777778, -0.110204861111111, -0.111579861111111
)), row.names = 3:6, class = "data.frame"), kmax = 3L, kernel = "rectangular")
library(kknn)
do.call(train.kknn, args)
# Error in best[1, 2] : subscript out of bounds
The error is happening because of the return value of dmEuclid
in the C code. When I step through in the R debugger, I see this:
Browse[2]> dmtmp$cl
[1] 0 1 2 2 2 0 0 0
[9] 1 2 1 1 32676 32676 32676 32676
[17] 1001200320 1001200320 1001200320 3
I'm guessing this is either uninitialized memory or an overflow/underflow problem.
Hi @KlausVigo , any thoughts on this?
Hi @kenahoo, you are right, this looks like an underflow or overflow problem. What system are you using? Maybe you can send me the output of sessionInfo()?
On my machine your code is working, so I can't really reproduce your problem.
...
kmax = 3L, kernel = "rectangular")
Type of response variable: continuous
minimal mean absolute error: 0.10675
Minimal mean squared error: 0.02878625
Best kernel: rectangular
Best k: 1
Regards, Klaus
Hi Klaus,
Actually, it seems to be non-deterministic. Here are a few runs in a row:
> do.call(train.kknn, args)
Error in cl[C] : only 0's may be mixed with negative subscripts
> do.call(train.kknn, args)
Error in best[1, 2] : subscript out of bounds
> do.call(train.kknn, args)
Error in best[1, 2] : subscript out of bounds
> do.call(train.kknn, args)
Error in cl[C] : only 0's may be mixed with negative subscripts
> do.call(train.kknn, args)
Error in best[1, 2] : subscript out of bounds
> do.call(train.kknn, args)
Call:
(function (formula, data, kmax = 11, ks = NULL, distance = 2, kernel = "optimal", ykernel = NULL, scale = TRUE, contrasts = c(unordered = "contr.dummy", ordered = "contr.ordinal"), ...) { if (is.null(ykernel)) ykernel = 0 weight.y = function(l = 1, diff = 0) { k = diff + 1 result = matrix(0, l, l)
# ... (that one was successful)
My session info:
> sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-apple-darwin17.7.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblasp-r0.3.5.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] kknn_1.3.1
loaded via a namespace (and not attached):
[1] compiler_3.5.2 magrittr_1.5 Matrix_1.2-15 igraph_1.2.2
[5] grid_3.5.2 pkgconfig_2.0.2 lattice_0.20-38
I took R (mostly) out of the chain, and it looks like it's non-deterministic at the C level. Stepping through with the lldb
debugger, I see this:
(lldb) p dmEuclid(mm_data, mm_data, &m, &p, &q, dm, cl, &k, we)
(lldb) p cl
(int [20]) $2 = {
[0] = 0
[1] = 1
[2] = 2
[3] = 3
[4] = 2
[5] = 0
[6] = 0
[7] = 2
[8] = 3
[9] = 2
[10] = 3
[11] = 0
[12] = 1
[13] = 3
[14] = 1
[15] = 1
[16] = 0
[17] = 1
[18] = 3
[19] = 1
}
(lldb) p dmEuclid(mm_data, mm_data, &m, &p, &q, dm, cl, &k, we)
(lldb) p cl
(int [20]) $3 = {
[0] = 0
[1] = 1
[2] = 2
[3] = 3
[4] = 2
[5] = 0
[6] = 0
[7] = 2
[8] = 3
[9] = 2
[10] = 3
[11] = 0
[12] = 1
[13] = 3
[14] = 1
[15] = 1
[16] = -1610612736
[17] = 1
[18] = 3
[19] = 1
}
I also noticed some mismatches between the order of argument names that train.kknn
is passing, and what dmEuclid
accepts:
.C("dmEuclid",
as.double(mm.data),
as.double(mm.data),
as.integer(m), <---- different order
as.integer(p), <---- different order
as.integer(q), <---- different order
dm = double((kmax + 2L) * p),
cl = integer((kmax + 2L) * p),
k = as.integer(kmax + 2),
as.double(we),
PACKAGE = "kknn")
void dmEuclid(double *learn,
double *valid,
int *n, <---- different order
int *m, <---- different order
int *p, <---- different order
double *dm,
int *cl,
int *k,
double *weights)
Any chance this is the issue?
Hi @kenahoo,
I should rename the parameters to correspond in the R and C code.
I think I finally figured it out. I kind of assumed that kmax + 2
is smaller than the number of observations.
It should now work if kmax + 1
is equal to the number of observations, and it decreases kmax otherwise.
Can you check if my last commit fixes your problem?
Hi @KlausVigo , I think that fixes the problem indeed.
Curiously, I could only get this problem to manifest when running R interactively, not through Rscript. Maybe memory initialization is different in that case or something.
Thanks.