Running the LDA test example fails: "submit application to yarn failed" (Connection refused to the master at 10.1.3.16:26604)
./angel-submit
-Daction.type=train
-Dangel.app.submit.class=com.tencent.angel.ml.lda.LDARunner
-Dml.model.class.name=com.tencent.angel.ml.lda.LDAModel
-Dangel.train.data.path="hdfs://10.1.3.15:8020//angel_data/data/abalone/abalone_8d_train.libsvm"
-Dangel.log.path="hdfs://10.1.3.15:8020/angel_data/model"
-Dangel.save.model.path="hdfs://10.1.3.15:8020/angel_data/model"
-Dsave.doc.topic=true
-Dsave.word.topic=true
-Dml.epoch.num=10
-Dml.data.type=dummy
-Dml.feature.index.range=1024
-Dangel.job.name=LDAtest
-Dangel.am.memory.gb=2
-Dangel.worker.memory.gb=2
-Dangel.ps.memory.gb=4
-Dangel.staging.dir="hdfs://10.1.3.15:8020/angel_data/model"
--queue queue01
-Dangel.output.path.deleteonexist=true
Log:
INFO yarn.AngelYarnClient: ApplicationSubmissionContext Queuename : queue02
19/09/16 17:49:21 INFO impl.YarnClientImpl: Submitted application application_1568606003884_0048
19/09/16 17:49:29 INFO yarn.AngelYarnClient: appMaster getTrackingUrl = http://slave1:8088/cluster/app/application_1568606003884_0048/
19/09/16 17:49:29 INFO yarn.AngelYarnClient: master host=10.1.3.16, port=26604
19/09/16 17:49:29 INFO yarn.AngelYarnClient: start to create rpc client to am
19/09/16 17:49:30 INFO client.AngelClient: clientId=1
19/09/16 17:50:45 ERROR yarn.AngelYarnClient: submit application to yarn failed.
com.google.protobuf.ServiceException: java.util.concurrent.ExecutionException: java.io.IOException: Error connecting to /10.1.3.16:26604
    at com.tencent.angel.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:317)
    at com.sun.proxy.$Proxy25.getAllPSLocation(Unknown Source)
    at com.tencent.angel.client.AngelClient.waitForAllPS(AngelClient.java:1108)
    at com.tencent.angel.client.yarn.AngelYarnClient.startPSServer(AngelYarnClient.java:172)
    at com.tencent.angel.ml.lda.LDARunner.train(LDARunner.scala:107)
    at com.tencent.angel.ml.core.MLRunner$class.submit(MLRunner.scala:45)
    at com.tencent.angel.ml.lda.LDARunner.submit(LDARunner.scala:29)
    at com.tencent.angel.utils.AngelRunJar$1.run(AngelRunJar.java:91)
    at com.tencent.angel.utils.AngelRunJar$1.run(AngelRunJar.java:77)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1724)
    at com.tencent.angel.utils.AngelRunJar.submit(AngelRunJar.java:77)
    at com.tencent.angel.utils.AngelRunJar.main(AngelRunJar.java:44)
Caused by: java.util.concurrent.ExecutionException: java.io.IOException: Error connecting to /10.1.3.16:26604
    at com.tencent.angel.ipc.CallFuture.get(CallFuture.java:121)
    at com.tencent.angel.ipc.NettyTransceiver.call(NettyTransceiver.java:297)
    at com.tencent.angel.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:294)
    ... 13 more
Caused by: java.io.IOException: Error connecting to /10.1.3.16:26604
    at com.tencent.angel.ipc.NettyTransceiver.getChannel(NettyTransceiver.java:149)
    at com.tencent.angel.ipc.NettyTransceiver.transceive(NettyTransceiver.java:338)
    at com.tencent.angel.ipc.NettyTransceiver.call(NettyTransceiver.java:292)
    ... 14 more
Caused by: java.net.ConnectException: Connection refused: /10.1.3.16:26604
    at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
    at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
    at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:208)
    at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:287)
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
    at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:116)
    at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:137)
    at java.lang.Thread.run(Thread.java:745)
19/09/16 17:50:45 ERROR utils.AngelRunJar: submit job failed
com.tencent.angel.exception.AngelException: com.google.protobuf.ServiceException: java.util.concurrent.ExecutionException: java.io.IOException: Error connecting to /10.1.3.16:26604
    at com.tencent.angel.client.yarn.AngelYarnClient.startPSServer(AngelYarnClient.java:176)
    at com.tencent.angel.ml.lda.LDARunner.train(LDARunner.scala:107)
    at com.tencent.angel.ml.core.MLRunner$class.submit(MLRunner.scala:45)
    at com.tencent.angel.ml.lda.LDARunner.submit(LDARunner.scala:29)
    at com.tencent.angel.utils.AngelRunJar$1.run(AngelRunJar.java:91)
    at com.tencent.angel.utils.AngelRunJar$1.run(AngelRunJar.java:77)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1724)
    at com.tencent.angel.utils.AngelRunJar.submit(AngelRunJar.java:77)
    at com.tencent.angel.utils.AngelRunJar.main(AngelRunJar.java:44)
Caused by: com.google.protobuf.ServiceException: java.util.concurrent.ExecutionException: java.io.IOException: Error connecting to /10.1.3.16:26604
    at com.tencent.angel.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:317)
    at com.sun.proxy.$Proxy25.getAllPSLocation(Unknown Source)
    at com.tencent.angel.client.AngelClient.waitForAllPS(AngelClient.java:1108)
    at com.tencent.angel.client.yarn.AngelYarnClient.startPSServer(AngelYarnClient.java:172)
    ... 10 more
Caused by: java.util.concurrent.ExecutionException: java.io.IOException: Error connecting to /10.1.3.16:26604
    at com.tencent.angel.ipc.CallFuture.get(CallFuture.java:121)
    at com.tencent.angel.ipc.NettyTransceiver.call(NettyTransceiver.java:297)
    at com.tencent.angel.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:294)
    ... 13 more
Caused by: java.io.IOException: Error connecting to /10.1.3.16:26604
    at com.tencent.angel.ipc.NettyTransceiver.getChannel(NettyTransceiver.java:149)
    at com.tencent.angel.ipc.NettyTransceiver.transceive(NettyTransceiver.java:338)
    at com.tencent.angel.ipc.NettyTransceiver.call(NettyTransceiver.java:292)
    ... 14 more
Caused by: java.net.ConnectException: Connection refused: /10.1.3.16:26604
    at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
    at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
    at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:208)
    at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:287)
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
    at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:116)
    at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:137)
    at java.lang.Thread.run(Thread.java:745)
java.util.concurrent.ExecutionException: java.io.IOException: Error connecting to /10.1.3.16:26604
我也遇到了这个问题，尝试以下操作后成功了：
- 关闭 Hadoop 集群
- 重启
- 重启 Hadoop 集群
java.util.concurrent.ExecutionException: java.io.IOException: Error connecting to /10.1.3.16:26604
我也遇到了这个问题，尝试以下操作后成功了：
- 关闭 Hadoop 集群
- 重启
- 重启 Hadoop 集群
我这边重启后还是不行。
我重启后也还是不行。