spark-MDLP-discretization
DiscretizerModelReader::load fails with Spark 2.1.1
Steps to reproduce with this dataset:
spark-shell --jars "/path/to/spark-MDLP-discretization-1.3.jar"
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._

val carsPath = "/path/to/cars.data"
val mdlPath = "/path/to/save/mdl"

// Read the cars dataset with header and schema inference.
val df = spark.read.option("header", true)
  .option("inferSchema", true)
  .csv(carsPath)

// Index the string label column.
val strId = new StringIndexer().setInputCol("v7").setOutputCol("v7_IDX")

// Assemble the numeric columns into a single feature vector.
val assembleFeatures: VectorAssembler = new VectorAssembler()
  .setInputCols(Array("v0", "v1", "v2", "v3", "v4", "v5", "v6"))
  .setOutputCol("featuresRaw")

// Discretize the assembled features with MDLP.
val dis: MDLPDiscretizer = new MDLPDiscretizer()
  .setInputCol("featuresRaw")
  .setOutputCol("featuresBucket")
  .setLabelCol("v7_IDX")
  .setMaxBins(10)
  .setMaxByPart(10000)
  .setMinBinPercentage(0.01)

// Fit and save the pipeline, then try to load it back.
val plm = new Pipeline()
  .setStages(Array(strId, assembleFeatures, dis))
  .fit(df)
plm.write.overwrite().save(mdlPath)
PipelineModel.load(mdlPath) // fails here
Gives:
scala.MatchError: [WrappedArray(WrappedArray(-Infinity, 21.05, Infinity), WrappedArray(-Infinity, 5.5, Infinity), WrappedArray(-Infinity, 120.5, 134.5, Infinity), WrappedArray(-Infinity, 78.5, Infinity), WrappedArray(-Infinity, 2550.5, Infinity), WrappedArray(-Infinity, Infinity), WrappedArray(-Infinity, Infinity))] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
at org.apache.spark.ml.feature.DiscretizerModel$DiscretizerModelReader.load(MDLPDiscretizer.scala:249)
at org.apache.spark.ml.feature.DiscretizerModel$DiscretizerModelReader.load(MDLPDiscretizer.scala:232)
at org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:435)
at org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)
at org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)
at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:347)
at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:341)
at org.apache.spark.ml.util.MLReadable$class.load(ReadWrite.scala:215)
at org.apache.spark.ml.PipelineModel$.load(Pipeline.scala:331)
... 50 elided
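The root cause appears to be that Spark hands array columns back in a Row as scala.collection.Seq (a WrappedArray at runtime), not as Array. If the reader pattern-matches the head row against concrete Array types, the match falls through and produces exactly the scala.MatchError above. A minimal sketch of that failure mode (an illustration only, not the actual code at MDLPDiscretizer.scala:249):

import org.apache.spark.sql.Row

// Illustration, not the repo's actual code: Row stores array columns as
// WrappedArray, so a pattern expecting Array[Array[Float]] never matches
// and the match throws scala.MatchError.
def readSplits(row: Row): Array[Array[Float]] = row match {
  case Row(splits: Array[Array[Float]]) => splits // WrappedArray does not match
}
// readSplits(sqlContext.read.parquet(dataPath).select("splits").head())
// => scala.MatchError: [WrappedArray(WrappedArray(...), ...)]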
Fix in ml.feature.MDLPDiscretizer.scala (line 258 ff.):
val splits = sqlContext.read.parquet(dataPath)
  .select("splits")
  .head().getAs[Seq[Seq[Float]]](0).map(arr => arr.toArray).toArray
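For reference, a self-contained sketch of the same round trip, runnable in the same spark-shell session and independent of the MDLP code (the demo path is made up; the "splits" column name mirrors the saved model data). It shows why getAs[Seq[Seq[Float]]] plus map(_.toArray) works: the nested column comes back as WrappedArray rather than Array.

import spark.implicits._

// Write a nested Float array shaped like the saved "splits" column
// (demoPath is a made-up location for this sketch).
val demoPath = "/tmp/splits-demo"
Seq(Tuple1(Seq(
  Seq(Float.NegativeInfinity, 21.05f, Float.PositiveInfinity),
  Seq(Float.NegativeInfinity, 5.5f, Float.PositiveInfinity))))
  .toDF("splits")
  .write.mode("overwrite").parquet(demoPath)

// Read it back: the column arrives as Seq[Seq[Float]] (WrappedArray at
// runtime), so convert to Array[Array[Float]] explicitly.
val splits: Array[Array[Float]] = spark.read.parquet(demoPath)
  .select("splits")
  .head().getAs[Seq[Seq[Float]]](0).map(_.toArray).toArray

splits.foreach(a => println(a.mkString("[", ", ", "]")))

The explicit map(_.toArray).toArray conversion is the step the proposed fix adds.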
If possible, please open a PR with this change; that will better reflect your contribution to this repo.