buildkit icon indicating copy to clipboard operation
buildkit copied to clipboard

scheduler "return leaving outgoing open" during parallel job cancel

Open jedevc opened this issue 1 year ago • 1 comments

Bit of a brain dump - while exploring another related scheduler bug (attempting to investigate https://github.com/moby/buildkit/issues/3635), I managed to construct an interesting (and very reproducible!) test case that could potentially occur:

// scheduler_test.go

func TestParallelCancel(t *testing.T) {
	t.Parallel()

	// this is quite rare 😥
	for i := 0; i < 10000; i++ {
		ctx := context.TODO()

		cacheManager := newTrackingCacheManager(NewInMemoryCacheManager())

		l := NewSolver(SolverOpt{
			ResolveOpFunc: testOpResolver,
			DefaultCache:  cacheManager,
		})
		defer l.Close()

		j0, err := l.NewJob("j0")
		require.NoError(t, err)
		j1, err := l.NewJob("j1")
		require.NoError(t, err)

		g := Edge{
			Vertex: vtxSum(1, vtxOpt{
				inputs: []Edge{
					{Vertex: vtxConst(3, vtxOpt{})},
					{Vertex: vtxConst(2, vtxOpt{})},
				},
			}),
		}

		canceledCtx, cancel := context.WithCancel(ctx)
		cancel()

		eg := errgroup.Group{}
		eg.Go(func() error {
			// a normal build
			_, err := j0.Build(ctx, g)
			return err
		})
		eg.Go(func() error {
			// a build that is launched, and then pretty immediately cancelled
			// NOTE: the failure only occurs if they are run in parallel! in serial, these succeed!
			_, err := j1.Build(canceledCtx, g)
			if err != nil && err != context.Canceled {
				return err
			}
			return nil
		})
		require.NoError(t, eg.Wait())
	}
}

This fails with:

$ go test -run "TestParallelCancel$" -v
=== RUN   TestParallelCancel
=== PAUSE TestParallelCancel
=== CONT  TestParallelCancel
    scheduler_test.go:3259: 
        	Error Trace:	/home/jedevc/Documents/Projects/moby/buildkit/solver/scheduler_test.go:3259
        	Error:      	Received unexpected error:
        	            	buildkit scheduler error: return leaving outgoing open. Please report this with BUILDKIT_SCHEDULER_DEBUG=1
        	            	github.com/moby/buildkit/solver.(*scheduler).dispatch
        	            		/home/jedevc/Documents/Projects/moby/buildkit/solver/scheduler.go:214
        	            	github.com/moby/buildkit/solver.(*scheduler).loop
        	            		/home/jedevc/Documents/Projects/moby/buildkit/solver/scheduler.go:118
        	            	runtime.goexit
        	            		/usr/lib/go/src/runtime/asm_amd64.s:1695
        	Test:       	TestParallelCancel
--- FAIL: TestParallelCancel (1.12s)
FAIL
exit status 1
FAIL	github.com/moby/buildkit/solver	1.126s

Maybe a bit of an edge case? But also this does seem to reproduce fairly consistently, and could potentially be another nastier issue.

jedevc avatar Mar 04 '24 18:03 jedevc

I also found this while working on this, related but a bit different (also not sure if it's even valid):

func TestImmediateCancel(t *testing.T) {
	t.Parallel()

	for i := 0; i < 10000; i++ {
		ctx := context.TODO()

		cacheManager := newTrackingCacheManager(NewInMemoryCacheManager())

		l := NewSolver(SolverOpt{
			ResolveOpFunc: testOpResolver,
			DefaultCache:  cacheManager,
		})
		defer l.Close()

		j0, err := l.NewJob("j0")
		require.NoError(t, err)

		g := Edge{
			Vertex: vtxSum(1, vtxOpt{
				inputs: []Edge{
					{Vertex: vtxConst(3, vtxOpt{})},
					{Vertex: vtxConst(2, vtxOpt{})},
				},
			}),
		}

		// is it valid to call discard in parallel with Build?
		go j0.Discard()

		_, err = j0.Build(ctx, g)
		require.NoError(t, err)
	}
}
$ go test -run "TestImmediateCancel$" -v
=== RUN   TestImmediateCancel
=== PAUSE TestImmediateCancel
=== CONT  TestImmediateCancel
    scheduler_test.go:3294: 
        	Error Trace:	/home/jedevc/Documents/Projects/moby/buildkit/solver/scheduler_test.go:3294
        	Error:      	Received unexpected error:
        	            	failed to get edge: inconsistent graph state
        	            	github.com/moby/buildkit/solver.(*pipeFactory).NewInputRequest.func1
        	            		/home/jedevc/Documents/Projects/moby/buildkit/solver/scheduler.go:373
        	            	github.com/moby/buildkit/solver/internal/pipe.NewWithFunction.func2
        	            		/home/jedevc/Documents/Projects/moby/buildkit/solver/internal/pipe/pipe.go:82
        	            	runtime.goexit
        	            		/usr/lib/go/src/runtime/asm_amd64.s:1695
        	Test:       	TestImmediateCancel
--- FAIL: TestImmediateCancel (0.00s)
FAIL
exit status 1
FAIL	github.com/moby/buildkit/solver	0.009s

jedevc avatar Mar 04 '24 18:03 jedevc