spanner: sessions are not being closed when client is closed
I was debugging https://github.com/GoogleCloudPlatform/cloud-spanner-emulator/issues/159 problem and found out that using spanner.Client leaves about 100 sessions open per client. This in test environments can lead to a situation where creating 1000 clients ands up creating 80k sessions. This slows down trivial queries with new clients to 1s+.
It seems that client.Close does not close the batch created sessions.
Code
Reproducer
package main
import (
"context"
"errors"
"fmt"
"os"
"os/signal"
"time"
"cloud.google.com/go/spanner"
database "cloud.google.com/go/spanner/admin/database/apiv1"
"cloud.google.com/go/spanner/admin/database/apiv1/databasepb"
instance "cloud.google.com/go/spanner/admin/instance/apiv1"
"cloud.google.com/go/spanner/admin/instance/apiv1/instancepb"
spannerdirect "cloud.google.com/go/spanner/apiv1"
"cloud.google.com/go/spanner/apiv1/spannerpb"
"golang.org/x/sync/errgroup"
"google.golang.org/api/iterator"
"google.golang.org/api/option"
"google.golang.org/api/option/internaloption"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
)
func main() {
err := run()
if err != nil {
fmt.Fprint(os.Stderr, err)
os.Exit(1)
}
}
func run() error {
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()
const (
ProjectID, InstanceID, DatabaseID = "proj", "inst", "db"
Database = "projects/" + ProjectID + "/instances/" + InstanceID + "/databases/" + DatabaseID
)
{ // create a new instance
admin, err := instance.NewInstanceAdminClient(ctx)
if err != nil {
return fmt.Errorf("failed to create instance admin: %w", err)
}
op, err := admin.CreateInstance(ctx, &instancepb.CreateInstanceRequest{
Parent: "projects/" + ProjectID,
InstanceId: InstanceID,
})
if err != nil {
if spanner.ErrCode(err) == codes.AlreadyExists {
goto instanceReady
}
return fmt.Errorf("create instance failed: %w", err)
}
if _, err := op.Wait(ctx); err != nil {
return fmt.Errorf("failed to wait instance creation: %w", err)
}
instanceReady:
if err := admin.Close(); err != nil {
return fmt.Errorf("failed to close instance admin: %w", err)
}
}
{ // create a database
admin, err := database.NewDatabaseAdminClient(ctx)
if err != nil {
return fmt.Errorf("failed to create database admin: %w", err)
}
ddl, err := admin.CreateDatabase(ctx, &databasepb.CreateDatabaseRequest{
Parent: "projects/" + ProjectID + "/instances/" + InstanceID,
DatabaseDialect: databasepb.DatabaseDialect_GOOGLE_STANDARD_SQL,
CreateStatement: "CREATE DATABASE " + DatabaseID,
ExtraStatements: []string{},
})
if err != nil {
if spanner.ErrCode(err) == codes.AlreadyExists {
goto databaseReady
}
return fmt.Errorf("failed to create database: %w", err)
}
if _, err := ddl.Wait(ctx); err != nil {
return fmt.Errorf("failed to wait database creation: %w", err)
}
databaseReady:
if err := admin.Close(); err != nil {
return fmt.Errorf("failed to close database admin: %w", err)
}
}
const N = 1000
var group errgroup.Group
group.SetLimit(128)
for k := 0; k < N; k++ {
if ctx.Err() != nil {
break
}
k := k
group.Go(func() error {
client, err := spanner.NewClient(ctx, Database)
if err != nil {
return fmt.Errorf("failed to create client: %w", err)
}
start := time.Now()
err = client.Single().Query(ctx, spanner.Statement{SQL: `SELECT 1`}).Do(
func(row *spanner.Row) error {
var v int64
return row.Columns(&v)
})
if err != nil {
return fmt.Errorf("failed to query: %w", err)
}
finish := time.Now()
client.Close()
if k%100 == 0 { // just a few random samples
fmt.Printf("%v%% last:%v\n", k*100/N, finish.Sub(start))
}
return nil
})
}
if err := group.Wait(); err != nil {
return err
}
count, err := fetchSessionCount(ctx, Database)
if err != nil {
return fmt.Errorf("failed to fetch count: %w", err)
}
fmt.Printf("SESSIONS %v\n", count)
return nil
}
func fetchSessionCount(ctx context.Context, database string) (int64, error) {
client, err := spannerdirect.NewClient(ctx,
option.WithEndpoint(os.Getenv("SPANNER_EMULATOR_HOST")),
option.WithGRPCDialOption(grpc.WithInsecure()),
option.WithoutAuthentication(),
option.WithGRPCDialOption(grpc.WithBlock()),
internaloption.SkipDialSettingsValidation(),
)
if err != nil {
return 0, fmt.Errorf("failed to create monitoring client: %w", err)
}
defer func() {
err = errors.Join(err, client.Close())
}()
count, err := countSessions(ctx, client, database)
if err != nil {
return 0, fmt.Errorf("count sessions failed: %w", err)
}
return count, nil
}
func countSessions(ctx context.Context, client *spannerdirect.Client, database string) (int64, error) {
var count int64
req := &spannerpb.ListSessionsRequest{
Database: database,
PageSize: 1e6,
PageToken: "", // TODO: handle pagination
}
it := client.ListSessions(ctx, req)
for {
resp, err := it.Next()
if err == iterator.Done {
return count, nil
}
if err != nil {
return count, fmt.Errorf("failed to iterate: %w", err)
}
if false {
fmt.Println(resp.Name)
}
count++
}
}
Expected behavior
After client.Close returns, there are no sessions left open. This assumes normal operations, of course.
Actual behavior
100 new sessions open after client.Close().
Screenshots
Plot of time taken for SELECT 1 over time, where each query creates a new client.
@egonelbre It was decided before that client libraries won't do the session cleanup or(DeleteSession RPC calls), so if a session is idle for an hour in backend it should be done automatically.
https://github.com/googleapis/google-cloud-go/pull/6679 cc: @olavloite
With multiplexed sessions this wouldn't help anymore, because they won't be deleted in spanner emulator https://github.com/GoogleCloudPlatform/cloud-spanner-emulator/blob/master/frontend/collections/session_manager.cc#L117