UI does not accommodate jobs with many children
We have jobs that can have 500 children, which themselves can have 500 children, etc. This leads to a pipeline comprising up to 100 000 jobs.
In that situation, the UI attempts to load all the objects in the pipeline tree in one go and exceeds the memory available to the App Engine instance.
A better option would be to progressively load the child jobs (and the attached slots, error entities etc) as they are required, starting with the jobs that are direct children of the pipeline. The child jobs would only be loaded when the user tries to expand part of the UI.
For information, the use case for those 100 000 jobs is iterating over all the users in a Google Apps domain. We can iterate by pages of 500 users, and we spawn a job for each user.
This actually sounds like a use case for MapReduce: https://github.com/GoogleCloudPlatform/appengine-mapreduce
It is built on pipelines and is designed to handle exactly this sort of thing efficiently.
Are you getting the following error?
/_ah/pipeline/rpc/tree
com.google.apphosting.api.ApiProxy$UnknownException: An error occurred for the API request datastore_v3.Next().
at com.google.apphosting.runtime.ApiProxyImpl$AsyncApiFuture.setRpcError(ApiProxyImpl.java:545)
at com.google.apphosting.runtime.ApiProxyImpl$AsyncApiFuture.failure(ApiProxyImpl.java:509)
at com.google.net.rpc3.client.RpcStub$RpcCallbackDispatcher$1.runInContext(RpcStub.java:858)
at com.google.tracing.TraceContext$TraceContextRunnable$1.run(TraceContext.java:444)
at com.google.tracing.CurrentContext.runInContext(CurrentContext.java:220)
at com.google.tracing.TraceContext$AbstractTraceContextCallback.runInInheritedContextNoUnref(TraceContext.java:308)
at com.google.tracing.TraceContext$AbstractTraceContextCallback.runInInheritedContext(TraceContext.java:300)
at com.google.tracing.TraceContext$TraceContextRunnable.run(TraceContext.java:441)
at com.google.net.rpc3.client.RpcStub$RpcCallbackDispatcher.runCallback(RpcStub.java:894)
at com.google.net.rpc3.client.RpcStub$RpcCallbackDispatcher.rpcFinished(RpcStub.java:904)
at com.google.net.rpc3.client.RpcStub$RpcCallbackDispatcher.failure(RpcStub.java:889)
at com.google.net.rpc3.impl.client.RpcClientInternalContext.runCallbacks(RpcClientInternalContext.java:1133)
at com.google.net.rpc3.impl.client.RpcClientInternalContext.finishRpcAndNotifyApp(RpcClientInternalContext.java:1030)
at com.google.net.rpc3.impl.client.RpcNetChannel.afterFinishingActiveRpc(RpcNetChannel.java:1459)
at com.google.net.rpc3.impl.client.RpcNetChannel.finishRpc(RpcNetChannel.java:1280)
at com.google.net.rpc3.impl.client.RpcNetChannel.handleResponse(RpcNetChannel.java:2995)
at com.google.net.rpc3.impl.client.RpcNetChannel.messageReceived(RpcNetChannel.java:2731)
at com.google.net.rpc3.impl.client.RpcNetChannel.access$2900(RpcNetChannel.java:175)
at com.google.net.rpc3.impl.client.RpcNetChannel$TransportCallback.receivedMessage(RpcNetChannel.java:3922)
at com.google.net.rpc3.impl.client.RpcChannelTransportData$TransportCallback.receivedMessage(RpcChannelTransportData.java:669)
at com.google.net.rpc3.impl.wire.RpcBaseTransport.receivedMessage(RpcBaseTransport.java:457)
at com.google.apphosting.runtime.udrpc.UdrpcTransport$ClientAdapter.receivedMessage(UdrpcTransport.java:577)
at com.google.apphosting.runtime.udrpc.UdrpcTransport.dispatchPacket(UdrpcTransport.java:386)
at com.google.apphosting.runtime.udrpc.UdrpcTransport.readPackets(UdrpcTransport.java:283)
at com.google.apphosting.runtime.udrpc.UdrpcTransport$1.run(UdrpcTransport.java:100)
at com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:260)
at com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:121)
at com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:594)
at com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1011)
at com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:892)
at com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:161)
at com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:1879)
Getting this exception for bigger pipelines.
Hi William, no, I most commonly get this error:
Error for /_ah/pipeline/rpc/tree
java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:2367)
at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:114)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:416)
at java.lang.StringBuffer.append(StringBuffer.java:237)
at org.json.JSONObject.toString(JSONObject.java:1386)
at org.json.JSONObject.toString(JSONObject.java:1340)
at com.google.appengine.tools.pipeline.impl.util.JsonUtils.mapToJson(JsonUtils.java:36)
at com.google.appengine.tools.pipeline.impl.servlets.JsonGenerator.pipelineObjectsToJson(JsonGenerator.java:103)
at com.google.appengine.tools.pipeline.impl.servlets.JsonTreeHandler.doGet(JsonTreeHandler.java:58)
at com.google.appengine.tools.pipeline.impl.servlets.PipelineServlet.doGet(PipelineServlet.java:97)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:617)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
at org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:511)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1166)
at eu.revevol.signaturemanager.server.jobs.PipelineRPCNamespaceFilter.doFilter(PipelineRPCNamespaceFilter.java:40)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.googlecode.objectify.cache.AsyncCacheFilter.doFilter(AsyncCacheFilter.java:59)
at com.googlecode.objectify.ObjectifyFilter.doFilter(ObjectifyFilter.java:49)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.utils.servlet.ParseBlobUploadFilter.doFilter(ParseBlobUploadFilter.java:125)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.runtime.jetty.SaveSessionFilter.doFilter(SaveSessionFilter.java:35)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.utils.servlet.JdbcMySqlConnectionCleanupFilter.doFilter(JdbcMySqlConnectionCleanupFilter.java:60)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.utils.servlet.TransactionCleanupFilter.doFilter(TransactionCleanupFilter.java:43)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:388)
at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:765)
As usual with memory-related errors on App Engine, the process is sometimes killed without explanation, in which case I see this message:
The process handling this request unexpectedly died. This is likely to cause a new process to be used for the next request to your application. (Error code 203)
dhatanian: Your stack trace is a dup of: https://github.com/GoogleCloudPlatform/appengine-pipelines/issues/1
The fix is twofold: retrieve less info in the UI (the data per item is currently quite large) and bound the number of items retrieved.
I am going to start work on this on my local fork. Here are my initial thoughts:
- It is fine to get all instance slots, barriers, instances etc for a child or set of children (there will be no paging for this operation)
- Every job will only query for itself and its immediate children, so the root job will get its children and their data; once a job is selected, status.html will send another query to get the slots, barriers and instances for the selected item's children, and so on.
This will probably require changes to:
- the pipeline manager
- the JsonTreeHandler or maybe require a new handler
- the status.html
- probably others
Give me a shout if you have any other thoughts
@dhatanian if you are inclined to do a little experiment, it might be worth upping the spec of your app engine instances for a little bit to see whether you get a datastore issue once the memory is no longer an issue.
I totally agree with your initial thoughts. I have upgraded my instance type to F4_1G, and the most common error I get now is indeed a datastore issue:
com.google.appengine.api.datastore.DatastoreFailureException: Unexpected failure
at com.google.appengine.api.datastore.FutureHelper.getInternal(FutureHelper.java:78)
at com.google.appengine.api.datastore.FutureHelper.quietGet(FutureHelper.java:35)
at com.google.appengine.api.datastore.BaseQueryResultsSource.loadMoreEntities(BaseQueryResultsSource.java:186)
at com.google.appengine.api.datastore.QueryResultIteratorImpl.ensureLoaded(QueryResultIteratorImpl.java:161)
at com.google.appengine.api.datastore.QueryResultIteratorImpl.nextList(QueryResultIteratorImpl.java:115)
at com.google.appengine.api.datastore.LazyList.forceResolveToIndex(LazyList.java:93)
at com.google.appengine.api.datastore.LazyList.resolveToIndex(LazyList.java:73)
at com.google.appengine.api.datastore.LazyList.resolveToIndex(LazyList.java:56)
at com.google.appengine.api.datastore.LazyList.access$000(LazyList.java:28)
at com.google.appengine.api.datastore.LazyList$1.hasNext(LazyList.java:174)
at com.google.appengine.tools.pipeline.impl.backend.AppEngineBackEnd.queryFullPipeline(AppEngineBackEnd.java:545)
at com.google.appengine.tools.pipeline.impl.PipelineManager.queryFullPipeline(PipelineManager.java:323)
at com.google.appengine.tools.pipeline.impl.servlets.JsonTreeHandler.doGet(JsonTreeHandler.java:57)
at com.google.appengine.tools.pipeline.impl.servlets.PipelineServlet.doGet(PipelineServlet.java:97)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:617)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
at org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:511)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1166)
at eu.revevol.signaturemanager.server.jobs.PipelineRPCNamespaceFilter.doFilter(PipelineRPCNamespaceFilter.java:40)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.googlecode.objectify.cache.AsyncCacheFilter.doFilter(AsyncCacheFilter.java:59)
at com.googlecode.objectify.ObjectifyFilter.doFilter(ObjectifyFilter.java:49)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.utils.servlet.ParseBlobUploadFilter.doFilter(ParseBlobUploadFilter.java:125)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.runtime.jetty.SaveSessionFilter.doFilter(SaveSessionFilter.java:35)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.utils.servlet.JdbcMySqlConnectionCleanupFilter.doFilter(JdbcMySqlConnectionCleanupFilter.java:60)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at com.google.apphosting.utils.servlet.TransactionCleanupFilter.doFilter(TransactionCleanupFilter.java:43)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)
at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:388)
at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:765)
at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:418)
at com.google.apphosting.runtime.jetty.AppVersionHandlerMap.handle(AppVersionHandlerMap.java:254)
at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
at org.mortbay.jetty.Server.handle(Server.java:326)
at org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
at org.mortbay.jetty.HttpConnection$RequestHandler.headerComplete(HttpConnection.java:923)
at com.google.apphosting.runtime.jetty.RpcRequestParser.parseAvailable(RpcRequestParser.java:76)
at org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404)
at com.google.apphosting.runtime.jetty.JettyServletEngineAdapter.serviceRequest(JettyServletEngineAdapter.java:146)
at com.google.apphosting.runtime.JavaRuntime$RequestRunnable.run(JavaRuntime.java:527)
at com.google.tracing.TraceContext$TraceContextRunnable.runInContext(TraceContext.java:437)
at com.google.tracing.TraceContext$TraceContextRunnable$1.run(TraceContext.java:444)
at com.google.tracing.CurrentContext.runInContext(CurrentContext.java:220)
at com.google.tracing.TraceContext$AbstractTraceContextCallback.runInInheritedContextNoUnref(TraceContext.java:308)
at com.google.tracing.TraceContext$AbstractTraceContextCallback.runInInheritedContext(TraceContext.java:300)
at com.google.tracing.TraceContext$TraceContextRunnable.run(TraceContext.java:441)
at com.google.apphosting.runtime.ThreadGroupPool$PoolEntry.run(ThreadGroupPool.java:251)
at java.lang.Thread.run(Thread.java:724)
Caused by: java.lang.InterruptedException
at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:374)
at com.google.common.util.concurrent.AbstractFuture$TrustedFuture.get(AbstractFuture.java:85)
at com.google.appengine.tools.development.TimedFuture.get(TimedFuture.java:42)
at com.google.common.util.concurrent.ForwardingFuture.get(ForwardingFuture.java:63)
at com.google.appengine.api.utils.FutureWrapper.get(FutureWrapper.java:88)
at com.google.appengine.api.datastore.FutureHelper.getInternal(FutureHelper.java:75)
... 52 more