a span ended in an async function can become of a parent of spans after that function ends
The Problem
const apm = require('elastic-apm-node').start()
async function somethingToAwait () {
return 42
}
async function firstThing () {
// await Promise.resolve()
const span = apm.startSpan('firstThing')
await somethingToAwait()
span.end()
}
async function secondThing () {
const span = apm.startSpan('secondThing')
// ...
span.end()
}
async function main () {
const trans = apm.startTransaction('myTransaction')
await firstThing()
await secondThing()
trans.end()
}
main()
Tracing this results in this trace:
trace 5d54ad
`- transaction 731c00 "myTransaction"
`- span 67b9ad "firstThing"
`- span 2f21b1 "secondThing"
The problem is that secondThing is a child of firstThing when we would not expect it to be so.
Workaround
If you comment out await somethingToAwait() (i.e. have an async function that does not use await) then the parent/child relationships are as expected. However, obviously, avoiding await isn't a reasonable solution.
trace b877f3
`- transaction 6c3739 "myTransaction"
`- span 285f98 "firstThing"
`- span 54ac1b "secondThing"
A different workaround is to add await Promise.resolve() before apm.startSpan(...).
This is effectively the solution that Dario has used in @kbn/apm-utils's withSpan(): https://github.com/elastic/kibana/blob/595b6948a1492a04fff14c8d6b2f8b50308681c0/packages/kbn-apm-utils/src/index.ts#L54
This workaround is still required after the run context work, as discuss in the comments starting at: https://github.com/elastic/apm-agent-nodejs/pull/2181#issuecomment-949605365
(/cc @AlexanderWert This is the same thing you have hit in your Lambda example.)
To be continued...
tl;dr: Execution of an async function creates and returns a Promise either when it hits return (or the end of the function) or when it hits the first await. The way run context (a.k.a. async context) tracking is done with async_hooks, is to (indirectly) attach the current tracing state to newly created async tasks. When await somethingToAwait() is hit, the currentSpan is Span firstThing, and this is returned to the calling function main. When main resumes, its current execution async task is the same as that promise. By design, it doesn't matter that Span firstThing has ended, it is still used as the parent for the subsequently created Span secondThing. Why "by design"? See https://github.com/elastic/apm-agent-nodejs/blob/main/test/instrumentation/run-context/fixtures/parentage-with-ended-span.js
TODO: I'll add some details, and why I think an automatic fix for startSpan() may not be feasible, and adding a withSpan API (akin to OTel's withActiveSpan() and similar in other tracers) is an appropriate fix.
Note to self: Here is a diff to the "test/instrumentation/run-context/fixtures/ls-await.js" test fixture to show triggering this error case. A modified version of this might be good to eventually use as a test case:
diff --git a/test/instrumentation/run-context/fixtures/ls-await.js b/test/instrumentation/run-context/fixtures/ls-await.js
index 00cfce7d..f03a6084 100644
--- a/test/instrumentation/run-context/fixtures/ls-await.js
+++ b/test/instrumentation/run-context/fixtures/ls-await.js
@@ -16,6 +16,7 @@ var apm = require('../../../../').start({ // elastic-apm-node
})
let assert = require('assert')
+const { executionAsyncId, triggerAsyncId } = require('async_hooks')
if (Number(process.versions.node.split('.')[0]) > 8) {
assert = assert.strict
}
@@ -23,22 +24,34 @@ const fsp = require('fs').promises
let t1
+async function myFunc () {
+ console.log('XXX [xid=%s tid=%s] myFunc start: %s', executionAsyncId(), triggerAsyncId(), apm._instrumentation._runCtxMgr)
+ return 42
+}
+
async function getCwd () {
var s2 = apm.startSpan('cwd')
+ console.log('XXX [xid=%s tid=%s] before await myFunc: %s', executionAsyncId(), triggerAsyncId(), apm._instrumentation._runCtxMgr)
+ await myFunc() // XXX await or not await
+ console.log('XXX [xid=%s tid=%s] after await myFunc: %s', executionAsyncId(), triggerAsyncId(), apm._instrumentation._runCtxMgr)
try {
return process.cwd()
} finally {
assert(apm.currentTransaction === t1)
- assert(apm.currentSpan === s2)
+ // XXX assert(apm.currentSpan === s2)
s2.end()
+ console.log('XXX [xid=%s tid=%s] getCwd: after manual s2.end(): %s', executionAsyncId(), triggerAsyncId(), apm._instrumentation._
runCtxMgr)
}
}
async function main () {
+ console.log('XXX [xid=%s tid=%s] main start: %s', executionAsyncId(), triggerAsyncId(), apm._instrumentation._runCtxMgr)
t1 = apm.startTransaction('ls')
assert(apm.currentTransaction === t1)
try {
+ console.log('XXX [xid=%s tid=%s] before getCwd: %s', executionAsyncId(), triggerAsyncId(), apm._instrumentation._runCtxMgr)
const cwd = await getCwd()
+ console.log('XXX [xid=%s tid=%s] after getCwd: %s', executionAsyncId(), triggerAsyncId(), apm._instrumentation._runCtxMgr)
let entries
var s3 = apm.startSpan('readdir')
@@ -50,7 +63,7 @@ async function main () {
assert(apm.currentSpan === s3)
s3.end()
}
- assert(apm.currentSpan === null)
+ // XXX assert(apm.currentSpan === null)
console.log('entries:', entries)
} finally {
The full ls-await.js is here: https://gist.github.com/trentm/721ab01162b63b824e4be1528e030afc