spanner: Blind retry of RESOURCE_EXHAUSTED status on Read can hang indefinitely
It appears that #9739 added RESOURCE_EXHAUSTED to the list of status codes to retry, but the retry logic does not distinguish the RESOURCE_EXHAUSTED that gRPC generates locally when a request payload exceeds the configured send-size limit from a server-generated RESOURCE_EXHAUSTED (the throttling case that retry was presumably meant to handle), so an oversized request is retried forever.
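For illustration, the distinction I mean is roughly the sketch below. This is not the client library's actual code; the helper name, the RetryInfo check, and the message match are my assumptions about how one could tell the locally generated error apart from server-side throttling.

package retryclassify // illustrative sketch only

import (
	"strings"

	"google.golang.org/genproto/googleapis/rpc/errdetails"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// isRetryableResourceExhausted reports whether a RESOURCE_EXHAUSTED error
// looks like server-side throttling (plausibly worth retrying) rather than
// the "message larger than max" error that gRPC synthesizes locally before
// the request ever leaves the client.
func isRetryableResourceExhausted(err error) bool {
	st, ok := status.FromError(err)
	if !ok || st.Code() != codes.ResourceExhausted {
		return false
	}
	// A throttling response from the server may carry RetryInfo with a
	// suggested backoff; the locally generated oversized-request error
	// never does, and has a fixed message.
	for _, d := range st.Details() {
		if _, ok := d.(*errdetails.RetryInfo); ok {
			return true
		}
	}
	return !strings.Contains(st.Message(), "trying to send message larger than max")
}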
Client
spanner v1.60.0+
Environment
Any, but reproduced on Linux 6.1.0 (Ubuntu 23.10)
Code and Dependencies
The reproduction creates a KeySet that's too large for a single gRPC request. Here's an example main package that exits immediately with an error on v1.59.0 and hangs indefinitely (due to internal retries) on v1.60.0+.
package main

import (
	"context"
	"fmt"
	"os"
	"strconv"

	"cloud.google.com/go/spanner"
	"cloud.google.com/go/spanner/spannertest"
	"cloud.google.com/go/spanner/spansql"
	"google.golang.org/api/option"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func run() error {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	srv, testSpanErr := spannertest.NewServer("localhost:0")
	if testSpanErr != nil {
		return fmt.Errorf("failed to create new test spanner: %w", testSpanErr)
	}
	defer srv.Close()

	const tblName = "fizzlebat"
	const keyColumn = "foobar"
	const dataColumn = "foolbit"

	if createErr := srv.UpdateDDL(&spansql.DDL{List: []spansql.DDLStmt{
		&spansql.CreateTable{
			Name: tblName,
			Columns: []spansql.ColumnDef{
				{
					Name:    keyColumn,
					NotNull: true,
					Type: spansql.Type{
						Array: false,
						Len:   spansql.MaxLen,
						Base:  spansql.String,
					},
				},
				{
					Name:    dataColumn,
					NotNull: true,
					Type: spansql.Type{
						Array: false,
						Len:   spansql.MaxLen,
						Base:  spansql.String,
					},
				},
			},
			PrimaryKey: []spansql.KeyPart{{Column: keyColumn, Desc: false}},
		}}}); createErr != nil {
		return fmt.Errorf("failed to create table %q: %w", tblName, createErr)
	}

	conn, cErr := grpc.NewClient(srv.Addr, grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultCallOptions(grpc.MaxCallSendMsgSize(1_000_000)))
	if cErr != nil {
		return fmt.Errorf("failed to dial test spanner: %w", cErr)
	}

	const dbName = "projects/vimeo-starlord-dev-inmem/instances/inmem/databases/foobar"
	client, clientErr := spanner.NewClient(ctx, dbName, option.WithGRPCConn(conn))
	if clientErr != nil {
		return fmt.Errorf("failed to init spanner client: %w", clientErr)
	}
	// Make sure everything gets cleaned up from the clients.
	defer client.Close()

	// Start counting at 100B so we get 12 digits (in decimal).
	// Use sequential IDs because spannertest is naively implemented: it keeps
	// a map from keys to data plus a separate sorted slice of primary keys.
	// Since it sorts after every key insertion, this becomes O((n log n)^2)
	// (and possibly worse) for random key insertions, whereas with sequential
	// keys it doesn't need to re-sort (it just verifies that the slice is
	// sorted, which is O(n)), making this O(n^2) overall.
	const lowID = 100_000_000_000
	dbids := [100_000]spanner.Key{}
	for z := range dbids {
		dbids[z] = spanner.Key{strconv.FormatInt(int64(z)+lowID, 10)}
	}

	ks := spanner.KeySetFromKeys(dbids[:]...)
	iter := client.Single().Read(ctx, tblName, ks, []string{keyColumn, dataColumn})
	if iterErr := iter.Do(func(*spanner.Row) error { return nil }); iterErr != nil {
		return fmt.Errorf("failed to iterate: %w", iterErr)
	}
	return nil
}

func main() {
	if err := run(); err != nil {
		fmt.Fprintf(os.Stderr, "failure: %s\n", err)
		os.Exit(1)
	}
}
go.mod
module example.com/fizzlebat
go 1.23.3
require (
	cloud.google.com/go/spanner v1.72.0
	google.golang.org/api v0.203.0
	google.golang.org/grpc v1.67.1
)

require (
	cel.dev/expr v0.16.0 // indirect
	cloud.google.com/go v0.116.0 // indirect
	cloud.google.com/go/auth v0.9.9 // indirect
	cloud.google.com/go/auth/oauth2adapt v0.2.4 // indirect
	cloud.google.com/go/compute/metadata v0.5.2 // indirect
	cloud.google.com/go/iam v1.2.1 // indirect
	cloud.google.com/go/longrunning v0.6.1 // indirect
	cloud.google.com/go/monitoring v1.21.1 // indirect
	github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0 // indirect
	github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.24.1 // indirect
	github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect
	github.com/cespare/xxhash/v2 v2.3.0 // indirect
	github.com/cncf/xds/go v0.0.0-20240822171458-6449f94b4d59 // indirect
	github.com/envoyproxy/go-control-plane v0.13.0 // indirect
	github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect
	github.com/felixge/httpsnoop v1.0.4 // indirect
	github.com/go-logr/logr v1.4.2 // indirect
	github.com/go-logr/stdr v1.2.2 // indirect
	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
	github.com/google/s2a-go v0.1.8 // indirect
	github.com/google/uuid v1.6.0 // indirect
	github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
	github.com/googleapis/gax-go/v2 v2.13.0 // indirect
	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
	go.opencensus.io v0.24.0 // indirect
	go.opentelemetry.io/contrib/detectors/gcp v1.29.0 // indirect
	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 // indirect
	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 // indirect
	go.opentelemetry.io/otel v1.29.0 // indirect
	go.opentelemetry.io/otel/metric v1.29.0 // indirect
	go.opentelemetry.io/otel/sdk v1.29.0 // indirect
	go.opentelemetry.io/otel/sdk/metric v1.29.0 // indirect
	go.opentelemetry.io/otel/trace v1.29.0 // indirect
	golang.org/x/crypto v0.28.0 // indirect
	golang.org/x/net v0.30.0 // indirect
	golang.org/x/oauth2 v0.23.0 // indirect
	golang.org/x/sync v0.8.0 // indirect
	golang.org/x/sys v0.26.0 // indirect
	golang.org/x/text v0.19.0 // indirect
	golang.org/x/time v0.7.0 // indirect
	google.golang.org/genproto v0.0.0-20241015192408-796eee8c2d53 // indirect
	google.golang.org/genproto/googleapis/api v0.0.0-20241007155032-5fefd90f89a9 // indirect
	google.golang.org/genproto/googleapis/rpc v0.0.0-20241015192408-796eee8c2d53 // indirect
	google.golang.org/protobuf v1.35.1 // indirect
)
Expected behavior
The call should fail promptly with an error that wraps (preferably still carrying status RESOURCE_EXHAUSTED):
"trying to send message larger than max (1710063 vs. 1000000)"
In this case, the error I get with v1.59.0 is:
failure: failed to iterate: spanner: code = "ResourceExhausted", desc = "trying to send message larger than max (1800054 vs. 1000000)"
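For reference, this is roughly how our calling code recognizes that failure once it is surfaced instead of being retried. The message substring is an assumption about gRPC's current wording rather than a stable API, which is part of why a robust fix in the client would be preferable.

package sizecheck // illustrative sketch only

import (
	"strings"

	"cloud.google.com/go/spanner"
	"google.golang.org/grpc/codes"
)

// isRequestTooLarge reports whether err looks like gRPC's locally generated
// oversized-request failure, as surfaced by the spanner client.
func isRequestTooLarge(err error) bool {
	return spanner.ErrCode(err) == codes.ResourceExhausted &&
		strings.Contains(err.Error(), "trying to send message larger than max")
}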
Actual behavior
The client enters an internal retry loop and hangs indefinitely.
Additional context
This appears to be a regression introduced in v1.60.0 (presumably by #9739).
Note: we discovered this in tests for code that handles this case, and we have already rearranged that code to check the keyset size proactively rather than optimistically making the request and only checking the size on error. It's more important to us that the fix be robust than fast. (I realize it may take some time to side-channel additional info from the Spanner frontend.)
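The workaround we landed on looks roughly like the sketch below; the helper name and the per-key overhead constant are rough assumptions rather than measured values, and real code would want to tune them (or compute exact proto sizes).

package keysplit // illustrative sketch only

import "cloud.google.com/go/spanner"

const (
	maxRequestBytes = 1_000_000 // matches grpc.MaxCallSendMsgSize in the repro above
	perKeyOverhead  = 16        // assumed slack for per-key proto framing
)

// splitKeysBySize chunks string keys so that each chunk's estimated encoded
// size stays under maxRequestBytes, letting the caller issue one Read per
// chunk instead of a single oversized request.
func splitKeysBySize(keys []string) [][]spanner.Key {
	var out [][]spanner.Key
	var cur []spanner.Key
	size := 0
	for _, k := range keys {
		est := len(k) + perKeyOverhead
		if len(cur) > 0 && size+est > maxRequestBytes {
			out = append(out, cur)
			cur, size = nil, 0
		}
		cur = append(cur, spanner.Key{k})
		size += est
	}
	if len(cur) > 0 {
		out = append(out, cur)
	}
	return out
}

Each chunk can then be turned into a KeySet with spanner.KeySetFromKeys and read in its own request.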
cc: @Vizerai @rahul2393