seldon-server
seldon-server copied to clipboard
java.lang.AssertionError: assertion failed: lapack.dppsv returned 1. at scala.Predef$.assert(Predef.scala:179)
I am getting the error below when training matrix-factorization on the latest versions of seldon-server, 1.3.2 and 1.3.3.
16/06/16 06:26:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS 16/06/16 06:26:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS 16/06/16 06:26:45 WARN TaskSetManager: Lost task 0.0 in stage 57.0 (TID 39, 172.17.0.10): java.lang.AssertionError: assertion failed: lapack.dppsv returned 1. at scala.Predef$.assert(Predef.scala:179) at org.apache.spark.ml.recommendation.ALS$CholeskySolver.solve(ALS.scala:393) at org.apache.spark.ml.recommendation.ALS$$anonfun$org$apache$spark$ml$recommendation$ALS$$computeFactors$1.apply(ALS.scala:1170) at org.apache.spark.ml.recommendation.ALS$$anonfun$org$apache$spark$ml$recommendation$ALS$$computeFactors$1.apply(ALS.scala:1131) at org.apache.spark.rdd.PairRDDFunctions$$anonfun$mapValues$1$$anonfun$apply$41$$anonfun$apply$42.apply(PairRDDFunctions.scala:700) at org.apache.spark.rdd.PairRDDFunctions$$anonfun$mapValues$1$$anonfun$apply$41$$anonfun$apply$42.apply(PairRDDFunctions.scala:700) at scala.collection.Iterator$$anon$11.next(Iterator.scala:328) at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:278) at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171) at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78) at org.apache.spark.rdd.RDD.iterator(RDD.scala:262) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:300) at org.apache.spark.rdd.RDD.iterator(RDD.scala:264) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at org.apache.spark.scheduler.Task.run(Task.scala:88) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745)
Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944) at org.apache.spark.rdd.RDD$$anonfun$aggregate$1.apply(RDD.scala:1082) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1075) at org.apache.spark.ml.recommendation.ALS$.computeYtY(ALS.scala:1182) at org.apache.spark.ml.recommendation.ALS$.org$apache$spark$ml$recommendation$ALS$$computeFactors(ALS.scala:1123) at 
org.apache.spark.ml.recommendation.ALS$$anonfun$train$3.apply(ALS.scala:578) at org.apache.spark.ml.recommendation.ALS$$anonfun$train$3.apply(ALS.scala:575) at scala.collection.immutable.Range.foreach(Range.scala:141) at org.apache.spark.ml.recommendation.ALS$.train(ALS.scala:575) at org.apache.spark.mllib.recommendation.ALS.run(ALS.scala:239) at org.apache.spark.mllib.recommendation.ALS$.trainImplicit(ALS.scala:417)
smrutiranjans@smrutirn-ub:~/seldon-server/kubernetes/conf$ docker images REPOSITORY TAG IMAGE ID CREATED SIZE seldonio/seldon-stream 1.3.3 3d4b5505b2e6 2 days ago 728.3 MB seldonio/seldon-server 1.3.3 a17472a3a306 2 days ago 1.182 GB seldonio/mysql 1.0.1 eee2261bd6af 2 days ago 324.2 MB seldonio/seldon-control 2.0.1_v1 7c4e65c50538 2 days ago 3.283 GB seldonio/td-agent-server 1.0.5 80ae143f2cb5 4 days ago 433.1 MB wurstmeister/kafka latest 9d084d73a156 7 days ago 221.1 MB seldonio/grafana 1.0 abccee8b4538 2 weeks ago 230 MB seldonio/td-agent-node 1.0.2 453456d4e4b6 2 weeks ago 433.1 MB seldonio/zookeeper-k8s 1.0 bea89b8d2cbf 2 weeks ago 367.2 MB seldonio/influxdb 1.0 48951c1855e1 2 weeks ago 271.1 MB gcr.io/google_containers/hyperkube-amd64 v1.2.4 3c4f38def75b 5 weeks ago 316.8 MB gcr.io/google_containers/exechealthz 1.0 82a141f5d06d 11 weeks ago 7.116 MB gcr.io/google_containers/spark 1.5.2_v1 22712970844d 3 months ago 989.9 MB gcr.io/google_containers/etcd-amd64 2.2.1 3ae398308ded 4 months ago 28.19 MB gcr.io/google_containers/kube2sky 1.12 d4f3fadabe2b 6 months ago 24.48 MB gcr.io/google_containers/etcd 2.2.1 a6cd91debed1 7 months ago 28.19 MB gcr.io/google_containers/skydns 2015-10-13-8c72f8c 2927189088d7 8 months ago 40.55 MB gcr.io/google_containers/pause 2.0 2b58359142b0 8 months ago 350.2 kB seldonio/memcached 0.1 ed67562d41be 13 months ago 210.9 MB
root@seldon-control:/usr/lib/libblas# ls libblas.a libblas.so libblas.so.3 libblas.so.3.0
How are you running the Spark job? Can you run in Kubernetes the ml100k example? http://docs.seldon.io/ml100k.html
Hi Clive it worked... Log:-- 16/06/17 07:01:17 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS 16/06/17 07:01:17 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS training model took 9888ms /seldon-data/seldon-models/ml100k/matrix-factorization/1/userFeatures.txt.gz 16/06/17 07:01:22 WARN ExponentialBackoffRetry: maxRetries too large (100). Pinning to 29 16/06/17 07:01:22 INFO CuratorFrameworkImpl: Starting 16/06/17 07:01:22 INFO ZooKeeper: Initiating client connection, connectString=zookeeper-1:2181,zookeeper-2:2181,zookeeper-3:2181 sessionTimeout=60000 watcher=org.apache.curator.ConnectionState@75f35310 16/06/17 07:01:27 INFO ClientCnxn: Opening socket connection to server 10.0.0.26/10.0.0.26:2181. Will not attempt to authenticate using SASL (unknown error) 16/06/17 07:01:27 INFO ClientCnxn: Socket connection established to 10.0.0.26/10.0.0.26:2181, initiating session 16/06/17 07:01:27 INFO ClientCnxn: Session establishment complete on server 10.0.0.26/10.0.0.26:2181, sessionid = 0x1555d033a6f0051, negotiated timeout = 40000 16/06/17 07:01:27 INFO ConnectionStateManager: State change: CONNECTED 30.0,0.01,5.0,1.0,0.0,16635.0 Time taken 28627 Shutting down job
It was working when I was using different values with the same action type (as in ml100k), but it fails if different action types are used with the same weight. Since I am trying this on e-commerce data, I need weighted action types (1: view, 2: cart, 3: sales), and also maxSum to normalize the data for each type. During the initial batch data import I can achieve that using the same type with different values, but I also need the MfModelCreation job to honor the limit when training on subsequent days against the updated live action data. Please find some sample action data:
{"client": "test", "client_itemid": "181610640", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-22T02:35:03Z", "type": 3, "userid": 498994, "value": 1.0} {"client": "test", "client_itemid": "181610640", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-22T02:35:03Z", "type": 3, "userid": 498994, "value": 1.0} {"client": "test", "client_itemid": "181610640", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-22T02:35:03Z", "type": 3, "userid": 498994, "value": 1.0} {"client": "test", "client_itemid": "181610640", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-22T02:35:03Z", "type": 3, "userid": 498994, "value": 1.0} {"client": "test", "client_itemid": "69880072", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-21T23:50:28Z", "type": 3, "userid": 288630, "value": 1.0} {"client": "test", "client_itemid": "69880072", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-21T23:50:28Z", "type": 3, "userid": 288630, "value": 1.0} {"client": "test", "client_itemid": "69880072", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-21T23:50:28Z", "type": 3, "userid": 288630, "value": 1.0} {"client": "test", "client_itemid": "69880072", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-21T23:58:49Z", "type": 3, "userid": 288630, "value": 1.0} {"client": "test", "client_itemid": "69880072", "client_userid": "149000000943", "itemid": 149000000943, "rectag": "default", "timestamp_utc": "1970-08-21T23:58:49Z", "type": 3, "userid": 288630, "value": 1.0} {"client": "test", "client_itemid": "69880072", "client_userid": "149000000943", "itemid": 149000000943, 
"rectag": "default", "timestamp_utc": "1970-08-21T23:58:49Z", "type": 3, "userid": 288630, "value": 1.0}
As per the code below (note: the underscores were garbled by markdown rendering; restored here): val actionsByScore:RDD[((Int,Int),Double)] = actionsByTypeCount.mapValues{ case x:Map[Int,Int]=> x.map { case y: (Int, Int) => Math.min(weightingsMap(y._1)._1 * y._2, weightingsMap(y._1)._2) }.reduce(_ + _) } Please find the configuration I used: "config": { "activate": true, "alpha": 0.01, "days": 100, "inputPath": "%SELDON_MODELS%", "iterations": 20, "lambda": 0.1, "outputPath": "%SELDON_MODELS%", "rank": 30, "startDay": 1, "actionWeightings": {"actionMapping": [ {"actionType": 1, "valuePerAction": 1.0, "maxSum": 30.0}, {"actionType": 2, "valuePerAction": 5.0, "maxSum": 30.0}, {"actionType": 3, "valuePerAction": 3.0, "maxSum": 30.0} ] } }
Please let me know how I can fix it. It would also be helpful to know how to quickly debug the MfModelCreation job locally (without building seldon-control with Docker and deploying it to Kubernetes).
The MfCreationJob is in seldon-server/offline-jobs/spark This is a Java 7 project with pom.xml, so you should be able to load it in your favourite Java editor - e.g. Eclipse or IntelliJ and modify the code and run locally on files.
Thanks, Clive. The issue was fixed after using the correct "actionWeightings" configuration.
Issues go stale after 90d of inactivity.
Mark the issue as fresh with /remove-lifecycle stale
.
Stale issues rot after an additional 30d of inactivity and eventually close.
If this issue is safe to close now please do so with /close
.
/lifecycle stale