Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

spanner: sessions are not being closed when client is closed #10378

Open
egonelbre opened this issue Jun 13, 2024 · 1 comment
Open

spanner: sessions are not being closed when client is closed #10378

egonelbre opened this issue Jun 13, 2024 · 1 comment
Assignees
Labels
api: spanner Issues related to the Spanner API. triage me I really want to be triaged.

Comments

@egonelbre
Copy link
Contributor

egonelbre commented Jun 13, 2024

I was debugging GoogleCloudPlatform/cloud-spanner-emulator#159 problem and found out that using spanner.Client leaves about 100 sessions open per client. This in test environments can lead to a situation where creating 1000 clients ands up creating 80k sessions. This slows down trivial queries with new clients to 1s+.

It seems that client.Close does not close the batch created sessions.

Code

Reproducer
package main

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
	"time"

	"cloud.google.com/go/spanner"
	database "cloud.google.com/go/spanner/admin/database/apiv1"
	"cloud.google.com/go/spanner/admin/database/apiv1/databasepb"
	instance "cloud.google.com/go/spanner/admin/instance/apiv1"
	"cloud.google.com/go/spanner/admin/instance/apiv1/instancepb"
	spannerdirect "cloud.google.com/go/spanner/apiv1"
	"cloud.google.com/go/spanner/apiv1/spannerpb"
	"golang.org/x/sync/errgroup"
	"google.golang.org/api/iterator"
	"google.golang.org/api/option"
	"google.golang.org/api/option/internaloption"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
)

func main() {
	err := run()
	if err != nil {
		fmt.Fprint(os.Stderr, err)
		os.Exit(1)
	}
}

func run() error {
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
	defer cancel()

	const (
		ProjectID, InstanceID, DatabaseID = "proj", "inst", "db"

		Database = "projects/" + ProjectID + "/instances/" + InstanceID + "/databases/" + DatabaseID
	)

	{ // create a new instance
		admin, err := instance.NewInstanceAdminClient(ctx)
		if err != nil {
			return fmt.Errorf("failed to create instance admin: %w", err)
		}
		op, err := admin.CreateInstance(ctx, &instancepb.CreateInstanceRequest{
			Parent:     "projects/" + ProjectID,
			InstanceId: InstanceID,
		})
		if err != nil {
			if spanner.ErrCode(err) == codes.AlreadyExists {
				goto instanceReady
			}
			return fmt.Errorf("create instance failed: %w", err)
		}
		if _, err := op.Wait(ctx); err != nil {
			return fmt.Errorf("failed to wait instance creation: %w", err)
		}

	instanceReady:
		if err := admin.Close(); err != nil {
			return fmt.Errorf("failed to close instance admin: %w", err)
		}
	}

	{ // create a database
		admin, err := database.NewDatabaseAdminClient(ctx)
		if err != nil {
			return fmt.Errorf("failed to create database admin: %w", err)
		}

		ddl, err := admin.CreateDatabase(ctx, &databasepb.CreateDatabaseRequest{
			Parent:          "projects/" + ProjectID + "/instances/" + InstanceID,
			DatabaseDialect: databasepb.DatabaseDialect_GOOGLE_STANDARD_SQL,
			CreateStatement: "CREATE DATABASE " + DatabaseID,
			ExtraStatements: []string{},
		})
		if err != nil {
			if spanner.ErrCode(err) == codes.AlreadyExists {
				goto databaseReady
			}
			return fmt.Errorf("failed to create database: %w", err)
		}

		if _, err := ddl.Wait(ctx); err != nil {
			return fmt.Errorf("failed to wait database creation: %w", err)
		}

	databaseReady:
		if err := admin.Close(); err != nil {
			return fmt.Errorf("failed to close database admin: %w", err)
		}
	}

	const N = 1000

	var group errgroup.Group
	group.SetLimit(128)

	for k := 0; k < N; k++ {
		if ctx.Err() != nil {
			break
		}

		k := k
		group.Go(func() error {
			client, err := spanner.NewClient(ctx, Database)
			if err != nil {
				return fmt.Errorf("failed to create client: %w", err)
			}

			start := time.Now()
			err = client.Single().Query(ctx, spanner.Statement{SQL: `SELECT 1`}).Do(
				func(row *spanner.Row) error {
					var v int64
					return row.Columns(&v)
				})
			if err != nil {
				return fmt.Errorf("failed to query: %w", err)
			}
			finish := time.Now()

			client.Close()

			if k%100 == 0 { // just a few random samples
				fmt.Printf("%v%%  last:%v\n", k*100/N, finish.Sub(start))
			}

			return nil
		})
	}

	if err := group.Wait(); err != nil {
		return err
	}

	count, err := fetchSessionCount(ctx, Database)
	if err != nil {
		return fmt.Errorf("failed to fetch count: %w", err)
	}

	fmt.Printf("SESSIONS %v\n", count)

	return nil
}

func fetchSessionCount(ctx context.Context, database string) (int64, error) {
	client, err := spannerdirect.NewClient(ctx,
		option.WithEndpoint(os.Getenv("SPANNER_EMULATOR_HOST")),
		option.WithGRPCDialOption(grpc.WithInsecure()),
		option.WithoutAuthentication(),
		option.WithGRPCDialOption(grpc.WithBlock()),
		internaloption.SkipDialSettingsValidation(),
	)
	if err != nil {
		return 0, fmt.Errorf("failed to create monitoring client: %w", err)
	}
	defer func() {
		err = errors.Join(err, client.Close())
	}()

	count, err := countSessions(ctx, client, database)
	if err != nil {
		return 0, fmt.Errorf("count sessions failed: %w", err)
	}

	return count, nil
}

func countSessions(ctx context.Context, client *spannerdirect.Client, database string) (int64, error) {
	var count int64
	req := &spannerpb.ListSessionsRequest{
		Database:  database,
		PageSize:  1e6,
		PageToken: "", // TODO: handle pagination
	}

	it := client.ListSessions(ctx, req)
	for {
		resp, err := it.Next()
		if err == iterator.Done {
			return count, nil
		}
		if err != nil {
			return count, fmt.Errorf("failed to iterate: %w", err)
		}

		if false {
			fmt.Println(resp.Name)
		}

		count++
	}
}

Expected behavior

After client.Close returns, there are no sessions left open. This assumes normal operations, of course.

Actual behavior

100 new sessions open after client.Close().

Screenshots

Plot of time taken for SELECT 1 over time, where each query creates a new client.

image

@egonelbre egonelbre added the triage me I really want to be triaged. label Jun 13, 2024
@product-auto-label product-auto-label bot added the api: spanner Issues related to the Spanner API. label Jun 13, 2024
@rahul2393
Copy link
Contributor

@egonelbre It was decided before that client libraries won't do the session cleanup or(DeleteSession RPC calls), so if a session is idle for an hour in backend it should be done automatically.

#6679
cc: @olavloite

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
api: spanner Issues related to the Spanner API. triage me I really want to be triaged.
Projects
None yet
Development

No branches or pull requests

3 participants